1 1
Headline Goes Here Speaker Name or Subhead Goes Here
Building Hadoop Data Applications with Kite
Tom White @tom_e_white The Hive, February 18, 2014
Hadoop 0.1
% cat bigdata.txt | hadoop fs -put - in!% hadoop MyJob in out!% hadoop fs -get out!
2
Characteristics
• Batch applications only
• Low-level coding
• File format • Serialization
• Partitioning scheme
3
A Hadoop Stack
4
Applications
• [Batch] Analyze an archive of songs1
• [Interactive SQL] Ad hoc queries on recommendations from social media applications2
• [Search] Searching email traffic in near-realtime3
• [ML] Detecting fraudulent transactions using clustering4
5
[1] http://blog.cloudera.com/blog/2012/08/process-a-million-songs-with-apache-pig/  [2] http://blog.cloudera.com/blog/2014/01/how-wajam-answers-business-questions-faster-with-hadoop/  [3] http://blog.cloudera.com/blog/2013/09/email-indexing-using-cloudera-search/  [4] http://blog.cloudera.com/blog/2013/03/cloudera_ml_data_science_tools/
Outline
• A Typical Application
• Kite SDK
• An Example • Advanced Kite
• Conclusion
• Questions
6
A typical application (zoom 100:1)
7
A typical application (zoom 10:1)
8
A typical pipeline (zoom 5:1)
9
Kite SDK
10
Kite Codifies Best Practice as APIs, Tools, Docs and Examples
11
Kite
• A client-side library for writing Hadoop Data Applications
• First release was in April 2013 as CDK
• 0.11.0 earlier this month • Open source, Apache 2 license, kitesdk.org
• Modular
• Data module (HDFS, Flume, Crunch, Hive, HBase)
• Morphlines transformation module
• Maven plugin
12
An Example
13
Kite Data Module
• Dataset – a collection of entities
• DatasetRepository – physical storage location for datasets
• DatasetDescriptor – holds dataset metadata (schema, format) • DatasetWriter – write entities to a dataset in a stream
• DatasetReader – read entities from a dataset
14
1. Define the Event Entity
public class Event {! private long id;! private long timestamp;! private String source;!
// getters and setters!}!
15
2. Create the Events Dataset
DatasetRepository repo = DatasetRepositories.open("repo:hive");!
DatasetDescriptor descriptor =! new DatasetDescriptor.Builder()! .schema(Event.class).build();!
repo.create("events", descriptor);!
16
(2. or with the Maven plugin)
$ mvn kite:create-dataset \! -Dkite.repositoryUri='repo:hive' \! -Dkite.datasetName=events \! -Dkite.avroSchemaReflectClass=com.example.Event!
17
A peek at the Avro schema
$ hive -e "DESCRIBE EXTENDED events"!...!{! "type" : "record",! "name" : "Event",! "namespace" : "com.example",! "fields" : [! { "name" : "id", "type" : "long" },! { "name" : "timestamp", "type" : "long" },! { "name" : "source", "type" : "string" }! ]!}!
18
3. Write Events
Logger logger = Logger.getLogger(...);!
Event event = new Event();!event.setId(id);!event.setTimestamp(System.currentTimeMillis());!event.setSource(source);!logger.info(event);!
19
Log4j configuration
log4j.appender.flume = org.kitesdk.data.flume.Log4jAppender!
log4j.appender.flume.Hostname = localhost!log4j.appender.flume.Port = 41415!log4j.appender.flume.DatasetRepositoryUri = repo:hive!log4j.appender.flume.DatasetName = events!
20
The resulting file layout
/user! /hive! /warehouse! /events! /FlumeData.1375659013795! /FlumeData.1375659013796!
21
Avro files
4. Generate Summaries with Crunch
PCollection<Event> events = read(asSource(repo.load("events"), Event.class));!
PCollection<Summary> summaries = events! .by(new GetTimeBucket(), // minute of day, source! Avros.pairs(Avros.longs(), Avros.strings()))! .groupByKey()! .parallelDo(new MakeSummary(),! Avros.reflects(Summary.class));!
write(summaries, asTarget(repo.load("summaries"))!22
… and run using Maven
$ mvn kite:create-dataset -Dkite.datasetName=summaries ...!
<plugin>! <groupId>org.kitesdk</groupId>! <artifactId>kite-maven-plugin</artifactId>! <configuration>! <toolClass>com.example.GenerateSummaries</toolClass>! </configuration>!</plugin>!
$ mvn kite:run-tool!23
5. Query with Impala
$ impala-shell -q 'DESCRIBE events'!
+-----------+--------+-------------------+!| name | type | comment |!+-----------+--------+-------------------+!| id | bigint | from deserializer |!| timestamp | bigint | from deserializer |!| source | string | from deserializer |!+-----------+--------+-------------------+!
24
… Ad Hoc Queries
$ impala-shell -q 'SELECT source, COUNT(1) AS cnt FROM events GROUP BY source'!
+--------------------------------------+-----+!| source | cnt |!+--------------------------------------+-----+!| 018dc1b6-e6b0-489e-bce3-115917e00632 | 38 |!| bc80040e-09d1-4ad2-8bd8-82afd1b8431a | 85 |!+--------------------------------------+-----+!Returned 2 row(s) in 0.56s!
25
… or use JDBC
Class.forName("org.apache.hive.jdbc.HiveDriver");!
Connection connection = DriverManager.getConnection(! "jdbc:hive2://localhost:21050/;auth=noSasl");!
Statement statement = connection.createStatement();!
ResultSet resultSet = statement.executeQuery(! "SELECT * FROM summaries");!
26
Advanced Kite
27
Unified Storage Interface
• Dataset – streaming access, HDFS storage • RandomAccessDataset – random access, HBase storage
• PartitionStrategy defines how to map an entity to partitions in HDFS or row keys in HBase
28
Filesystem Partitions
PartitionStrategy p = new PartitionStrategy.Builder()! .year("timestamp")! .month("timestamp")! .day("timestamp").build();!
/user/hive/warehouse/events! /year=2014/month=02/day=08! /FlumeData.1375659013795! /FlumeData.1375659013796!
29
HBase Keys: Defined in Avro
{! "name": "username",! "type": "string",! "mapping": { "type": "key", "value": "0" }!},!{! "name": "favoriteColor",! "type": "string",! "mapping": { "type": "column", "value": "meta:fc" }!}!
30
Random Access Dataset: Creation
RandomAccessDatasetRepository repo = DatasetRepositories.openRandomAccess(!
"repo:hbase:localhost");!
RandomAccessDataset<User> users = repo.load("users");!
users.put(new User("bill", "green"));!users.put(new User("alice", "blue"));!
31
Random Access Dataset: Retrieval
Key key = new Key.Builder(users)! .add("username", "bill").build();!
User bill = users.get(key);!
32
Views
View<User> view = users.from("username", "bill");!
DatasetReader<User> reader = view.newReader();!reader.open();!for (User user : reader) {! System.out.println(user);!}!reader.close();!
33
Parallel Processing
• Goal is for Hadoop processing frameworks to “just work”
• Support Formats, Partitions, Views
• Native Kite components, e.g. DatasetOutputFormat for MR
34
HDFS Dataset | HBase Dataset
Crunch | Yes | 0.12.0
MapReduce | 0.12.0 | 0.12.0
Impala | Yes | Planned
Schema Evolution
public class Event {! private long id;! private long timestamp;! private String source;! @Nullable private String ipAddress;!}!
$ mvn kite:update-dataset \! -Dkite.datasetName=events \! -Dkite.avroSchemaReflectClass=com.example.Event!
35
Searchable Datasets
• Use Flume Solr Sink (in addition to HDFS Sink)
• Morphlines library to define fields to index
• SolrCloud runs on cluster from indexes in HDFS
• Future support in Kite to index selected fields automatically
36
Conclusion
37
Kite makes it easy to get data into Hadoop with a flexible schema model that is storage agnostic in a format that can be processed
with a wide range of Hadoop tools
38
Getting Started With Kite
• Examples at github.com/kite-sdk/kite-examples
• Working with streaming and random-access datasets
• Logging events to datasets from a webapp • Running a periodic job
• Migrating data from CSV to a Kite dataset
• Converting an Avro dataset to a Parquet dataset
• Writing and configuring Morphlines
• Using Morphlines to write JSON records to a dataset
39
41 41
About me
• Engineer at Cloudera working on Core Hadoop and Kite
• Apache Hadoop Committer, PMC Member, Apache Member
• Author of “Hadoop: The Definitive Guide”
42
Morphlines Example
43
morphlines : [
  {
    id : morphline1
    importCommands : ["com.cloudera.**", "org.apache.solr.**"]
    commands : [
      { readLine {} }
      {
        grok {
          dictionaryFiles : [/tmp/grok-dictionaries]
          expressions : {
            message : """<%{POSINT:syslog_pri}>%{SYSLOGTIMESTAMP:syslog_timestamp} %{SYSLOGHOST:syslog_hostname} %{DATA:syslog_program}(?:\[%{POSINT:syslog_pid}\])?: %{GREEDYDATA:syslog_message}"""
          }
        }
      }
      { loadSolr {} }
    ]
  }
]
Example Input!<164>Feb 4 10:46:14 syslog sshd[607]: listening on 0.0.0.0 port 22!Output Record!syslog_pri:164!syslog_timestamp:Feb 4 10:46:14!syslog_hostname:syslog!syslog_program:sshd!syslog_pid:607!syslog_message:listening on 0.0.0.0 port 22.!
Apps
• App – a packaged Java program that runs on a Hadoop cluster
• cdk:package-app – create a package on the local filesystem
• like an exploded WAR • Oozie format
• cdk:deploy-app – copy packaged app to HDFS
• cdk:run-app – execute the app
• Workflow app – runs once
• Coordinator app – runs other apps (like cron)
44