Transcript
Page 1: Building Hadoop Data Applications with Kite by Tom White

1 1

Headline Goes Here Speaker Name or Subhead Goes Here

Building Hadoop Data Applications with Kite

Tom White @tom_e_white The Hive, February 18, 2014

Page 2: Building Hadoop Data Applications with Kite by Tom White

Hadoop 0.1

% cat bigdata.txt | hadoop fs -put - in
% hadoop MyJob in out
% hadoop fs -get out

2

Page 3: Building Hadoop Data Applications with Kite by Tom White

Characteristics

•  Batch applications only

•  Low-level coding

•  File format
•  Serialization

•  Partitioning scheme

3

Page 4: Building Hadoop Data Applications with Kite by Tom White

A Hadoop Stack

4

Page 5: Building Hadoop Data Applications with Kite by Tom White

Applications

•  [Batch] Analyze an archive of songs [1]

•  [Interactive SQL] Ad hoc queries on recommendations from social media applications [2]

•  [Search] Searching email traffic in near-realtime [3]

•  [ML] Detecting fraudulent transactions using clustering [4]

5

[1] http://blog.cloudera.com/blog/2012/08/process-a-million-songs-with-apache-pig/
[2] http://blog.cloudera.com/blog/2014/01/how-wajam-answers-business-questions-faster-with-hadoop/
[3] http://blog.cloudera.com/blog/2013/09/email-indexing-using-cloudera-search/
[4] http://blog.cloudera.com/blog/2013/03/cloudera_ml_data_science_tools/

Page 6: Building Hadoop Data Applications with Kite by Tom White

Outline

• A Typical Application

•  Kite SDK

• An Example
• Advanced Kite

•  Conclusion

• Questions

6

Page 7: Building Hadoop Data Applications with Kite by Tom White

A typical application (zoom 100:1)

7

Page 8: Building Hadoop Data Applications with Kite by Tom White

A typical application (zoom 10:1)

8

Page 9: Building Hadoop Data Applications with Kite by Tom White

A typical pipeline (zoom 5:1)

9

Page 10: Building Hadoop Data Applications with Kite by Tom White

Kite SDK

10

Page 11: Building Hadoop Data Applications with Kite by Tom White

Kite Codifies Best Practice as APIs, Tools, Docs and Examples

11

Page 12: Building Hadoop Data Applications with Kite by Tom White

Kite

• A client-side library for writing Hadoop Data Applications

•  First release was in April 2013 as CDK

•  0.11.0 earlier this month
• Open source, Apache 2 license, kitesdk.org

• Modular

• Data module (HDFS, Flume, Crunch, Hive, HBase)

• Morphlines transformation module

• Maven plugin

12

Page 13: Building Hadoop Data Applications with Kite by Tom White

An Example

13

Page 14: Building Hadoop Data Applications with Kite by Tom White

Kite Data Module

• Dataset – a collection of entities

• DatasetRepository – physical storage location for datasets

• DatasetDescriptor – holds dataset metadata (schema, format)
• DatasetWriter – write entities to a dataset in a stream

• DatasetReader – read entities from a dataset

14

Page 15: Building Hadoop Data Applications with Kite by Tom White

1. Define the Event Entity

public class Event {
  private long id;
  private long timestamp;
  private String source;

  // getters and setters
}

15

Page 16: Building Hadoop Data Applications with Kite by Tom White

2. Create the Events Dataset

DatasetRepository repo = DatasetRepositories.open("repo:hive");

DatasetDescriptor descriptor =
    new DatasetDescriptor.Builder()
        .schema(Event.class).build();

repo.create("events", descriptor);

16

Page 17: Building Hadoop Data Applications with Kite by Tom White

(2. or with the Maven plugin)

$ mvn kite:create-dataset \
  -Dkite.repositoryUri='repo:hive' \
  -Dkite.datasetName=events \
  -Dkite.avroSchemaReflectClass=com.example.Event

17

Page 18: Building Hadoop Data Applications with Kite by Tom White

A peek at the Avro schema

$ hive -e "DESCRIBE EXTENDED events"
...
{
  "type" : "record",
  "name" : "Event",
  "namespace" : "com.example",
  "fields" : [
    { "name" : "id", "type" : "long" },
    { "name" : "timestamp", "type" : "long" },
    { "name" : "source", "type" : "string" }
  ]
}

18

Page 19: Building Hadoop Data Applications with Kite by Tom White

3. Write Events

Logger logger = Logger.getLogger(...);

Event event = new Event();
event.setId(id);
event.setTimestamp(System.currentTimeMillis());
event.setSource(source);
logger.info(event);

19

Page 20: Building Hadoop Data Applications with Kite by Tom White

Log4j configuration

log4j.appender.flume = org.kitesdk.data.flume.Log4jAppender

log4j.appender.flume.Hostname = localhost
log4j.appender.flume.Port = 41415
log4j.appender.flume.DatasetRepositoryUri = repo:hive
log4j.appender.flume.DatasetName = events

20

Page 21: Building Hadoop Data Applications with Kite by Tom White

The resulting file layout

/user
  /hive
    /warehouse
      /events
        /FlumeData.1375659013795
        /FlumeData.1375659013796

21

Avro files

Page 22: Building Hadoop Data Applications with Kite by Tom White

4. Generate Summaries with Crunch

PCollection<Event> events = read(asSource(repo.load("events"), Event.class));

PCollection<Summary> summaries = events
    .by(new GetTimeBucket(), // minute of day, source
        Avros.pairs(Avros.longs(), Avros.strings()))
    .groupByKey()
    .parallelDo(new MakeSummary(),
        Avros.reflects(Summary.class));

write(summaries, asTarget(repo.load("summaries"))

22

Page 23: Building Hadoop Data Applications with Kite by Tom White

… and run using Maven

$ mvn kite:create-dataset -Dkite.datasetName=summaries ...

<plugin>
  <groupId>org.kitesdk</groupId>
  <artifactId>kite-maven-plugin</artifactId>
  <configuration>
    <toolClass>com.example.GenerateSummaries</toolClass>
  </configuration>
</plugin>

$ mvn kite:run-tool

23

Page 24: Building Hadoop Data Applications with Kite by Tom White

5. Query with Impala

$ impala-shell -q 'DESCRIBE events'

+-----------+--------+-------------------+
| name      | type   | comment           |
+-----------+--------+-------------------+
| id        | bigint | from deserializer |
| timestamp | bigint | from deserializer |
| source    | string | from deserializer |
+-----------+--------+-------------------+

24

Page 25: Building Hadoop Data Applications with Kite by Tom White

… Ad Hoc Queries

$ impala-shell -q 'SELECT source, COUNT(1) AS cnt FROM events GROUP BY source'

+--------------------------------------+-----+
| source                               | cnt |
+--------------------------------------+-----+
| 018dc1b6-e6b0-489e-bce3-115917e00632 | 38  |
| bc80040e-09d1-4ad2-8bd8-82afd1b8431a | 85  |
+--------------------------------------+-----+
Returned 2 row(s) in 0.56s

25

Page 26: Building Hadoop Data Applications with Kite by Tom White

… or use JDBC

Class.forName("org.apache.hive.jdbc.HiveDriver");

Connection connection = DriverManager.getConnection(
    "jdbc:hive2://localhost:21050/;auth=noSasl");

Statement statement = connection.createStatement();

ResultSet resultSet = statement.executeQuery(
    "SELECT * FROM summaries");

26

Page 27: Building Hadoop Data Applications with Kite by Tom White

Advanced Kite

27

Page 28: Building Hadoop Data Applications with Kite by Tom White

Unified Storage Interface

• Dataset – streaming access, HDFS storage
•  RandomAccessDataset – random access, HBase storage

•  PartitionStrategy defines how to map an entity to partitions in HDFS or row keys in HBase

28

Page 29: Building Hadoop Data Applications with Kite by Tom White

Filesystem Partitions

PartitionStrategy p = new PartitionStrategy.Builder()
    .year("timestamp")
    .month("timestamp")
    .day("timestamp").build();

/user/hive/warehouse/events
  /year=2014/month=02/day=08
    /FlumeData.1375659013795
    /FlumeData.1375659013796

29

Page 30: Building Hadoop Data Applications with Kite by Tom White

HBase Keys: Defined in Avro

{
  "name": "username",
  "type": "string",
  "mapping": { "type": "key", "value": "0" }
},
{
  "name": "favoriteColor",
  "type": "string",
  "mapping": { "type": "column", "value": "meta:fc" }
}

30

Page 31: Building Hadoop Data Applications with Kite by Tom White

Random Access Dataset: Creation

RandomAccessDatasetRepository repo = DatasetRepositories.openRandomAccess(
    "repo:hbase:localhost");

RandomAccessDataset<User> users = repo.load("users");

users.put(new User("bill", "green"));
users.put(new User("alice", "blue"));

31

Page 32: Building Hadoop Data Applications with Kite by Tom White

Random Access Dataset: Retrieval

Key key = new Key.Builder(users)
    .add("username", "bill").build();

User bill = users.get(key);

32

Page 33: Building Hadoop Data Applications with Kite by Tom White

Views

View<User> view = users.from("username", "bill");

DatasetReader<User> reader = view.newReader();
reader.open();
for (User user : reader) {
  System.out.println(user);
}
reader.close();

33

Page 34: Building Hadoop Data Applications with Kite by Tom White

Parallel Processing

• Goal is for Hadoop processing frameworks to “just work”

•  Support Formats, Partitions, Views

• Native Kite components, e.g. DatasetOutputFormat for MR

34

            HDFS Dataset   HBase Dataset

Crunch      Yes            0.12.0

MapReduce   0.12.0         0.12.0

Impala      Yes            Planned

Page 35: Building Hadoop Data Applications with Kite by Tom White

Schema Evolution

public class Event {
  private long id;
  private long timestamp;
  private String source;
  @Nullable private String ipAddress;
}

$ mvn kite:update-dataset \
  -Dkite.datasetName=events \
  -Dkite.avroSchemaReflectClass=com.example.Event

35

Page 36: Building Hadoop Data Applications with Kite by Tom White

Searchable Datasets

• Use Flume Solr Sink (in addition to HDFS Sink)

• Morphlines library to define fields to index

•  SolrCloud runs on cluster from indexes in HDFS

•  Future support in Kite to index selected fields automatically

36

Page 37: Building Hadoop Data Applications with Kite by Tom White

Conclusion

37

Page 38: Building Hadoop Data Applications with Kite by Tom White

Kite makes it easy to get data into Hadoop with a flexible schema model that is storage agnostic in a format that can be processed with a wide range of Hadoop tools

38

Page 39: Building Hadoop Data Applications with Kite by Tom White

Getting Started With Kite

•  Examples at github.com/kite-sdk/kite-examples

• Working with streaming and random-access datasets

•  Logging events to datasets from a webapp
•  Running a periodic job

• Migrating data from CSV to a Kite dataset

•  Converting an Avro dataset to a Parquet dataset

• Writing and configuring Morphlines

• Using Morphlines to write JSON records to a dataset

39

Page 40: Building Hadoop Data Applications with Kite by Tom White

Questions?

kitesdk.org

@tom_e_white

[email protected]

40

Page 41: Building Hadoop Data Applications with Kite by Tom White

41 41

Page 42: Building Hadoop Data Applications with Kite by Tom White

About me

•  Engineer at Cloudera working on Core Hadoop and Kite

• Apache Hadoop Committer, PMC Member, Apache Member

• Author of “Hadoop: The Definitive Guide”

42

Page 43: Building Hadoop Data Applications with Kite by Tom White

Morphlines Example

43

morphlines : [
  {
    id : morphline1
    importCommands : ["com.cloudera.**", "org.apache.solr.**"]
    commands : [
      { readLine {} }
      {
        grok {
          dictionaryFiles : [/tmp/grok-dictionaries]
          expressions : {
            message : """<%{POSINT:syslog_pri}>%{SYSLOGTIMESTAMP:syslog_timestamp} %{SYSLOGHOST:syslog_hostname} %{DATA:syslog_program}(?:\[%{POSINT:syslog_pid}\])?: %{GREEDYDATA:syslog_message}"""
          }
        }
      }
      { loadSolr {} }
    ]
  }
]

Example Input
<164>Feb  4 10:46:14 syslog sshd[607]: listening on 0.0.0.0 port 22
Output Record
syslog_pri:164
syslog_timestamp:Feb  4 10:46:14
syslog_hostname:syslog
syslog_program:sshd
syslog_pid:607
syslog_message:listening on 0.0.0.0 port 22.

Page 44: Building Hadoop Data Applications with Kite by Tom White

Apps

• App – a packaged Java program that runs on a Hadoop cluster

•  cdk:package-app – create a package on the local filesystem

•  like an exploded WAR
• Oozie format

•  cdk:deploy-app – copy packaged app to HDFS

•  cdk:run-app – execute the app

• Workflow app – runs once

•  Coordinator app – runs other apps (like cron)

44


Top Related