????????:CentOS 6.4, hadoop-2.0.0-cdh4.2.0, JDK 1.6, spark-0.8.0-incubating-bin-cdh4.tar.gz, Scala 2.9.3
????1. ??????????????
?????ο????????Spark 0.8???(CentOS6.4) - ?????????????
????2. ????????
????????????????????????????????????json????(????DATA[1-9].json)??
    {"id":10,"first_name":"Ralph","last_name":"Kennedy","country":"Colombia","ip_address":"12.211.41.162","email":"rkennedy@oyonder.net"},
    {"id":11,"first_name":"Gary","last_name":"Cole","country":"Nepal","ip_address":"242.67.150.18","email":"gcole@browsebug.info"},
????…
?????????????????????100M???????????linux cp / cat?????????????????????????????С?????????????
????????????
??????????*.json?????ip??????м???????????ip???????????“241.*”ip??????????????????????HDFS?????
????2.1??? Spark ???
??????master?????
$>cd ~/spark-0.8.0
$>bin/start-all.sh
????????????????
$> jps
11055 Jps
2313 SecondaryNameNode
2409 JobTracker
2152 NameNode
4822 Master
???????master??web UI(???http://localhost:8080). ??????????????????е?word???????????CPU???????????????
????2.2????spark-shell??HDFS???????????IP???
// set the master node of the spark cluster and run spark-shell
$> MASTER=spark://centos01:7077 ./spark-shell
// read the json data
$>val file = sc.textFile("hdfs://sdc/user/hadoop/In/DATA*.json")
// filter the json data
$>val ips = file.filter(line => line.contains("ip_address"))
// Count all the IP
$>ips.count()
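// Count all the "241.*" IPs (note: contains("241.") matches that substring anywhere in the line, e.g. "12.241.3.4", not only at the start of the address)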
$>ips.filter(line => line.contains("241.")).count()
$>ips.filter(line => line.contains("241.")).collect()
????2.3 ???н??