PySpark can read data from any storage source supported by Hadoop, including the local file system, HDFS, Cassandra, HBase, Amazon S3, etc. Spark supports text files, SequenceFiles, and any other Hadoop InputFormat.
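A minimal sketch of reading from a few of these sources; the paths, host names, and bucket names below are placeholders and not part of the original text:

# Local file system (file:// is optional when the path is local to every node)
localLines = sc.textFile("file:///tmp/data.txt")                  # hypothetical path

# HDFS and S3 URIs are read the same way
hdfsLines = sc.textFile("hdfs://namenode:9000/user/data.txt")     # hypothetical cluster address
s3Lines   = sc.textFile("s3a://my-bucket/data.txt")               # hypothetical bucket

# SequenceFiles of key/value pairs
pairs = sc.sequenceFile("hdfs://namenode:9000/user/pairs.seq")    # hypothetical path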
data = [1, 2, 3, 4, 5]
# Parallelize across multiple CPUs; the number of partitions can be set explicitly, e.g. sc.parallelize(data, 4)
distData = sc.parallelize(data)
distData.reduce(lambda a, b: a + b)
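To see how the elements are split across partitions, one option (not in the original text) is glom(), which collects each partition into its own list:

sc.parallelize(data, 4).glom().collect()
# e.g. [[1], [2], [3], [4, 5]] -- one inner list per partition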
distFile = sc.textFile("README.md")
# Add up the lengths of all lines (i.e. the total number of characters)
distFile.map(lambda s: len(s)).reduce(lambda a, b: a + b)
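If the goal is the number of lines rather than the total size, count() does that directly; this variant is an addition for illustration, not from the original text:

distFile.count()   # number of lines in README.md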
conf = {"es.resource" : "index/type"} # assume Elasticsearch is running on localhost defaults rdd = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat", "org.apache.hadoop.io.NullWritable", "org.elasticsearch.hadoop.mr.LinkedMapWritable", conf=conf) rdd.first() # the result is a MapWritable that is converted to a Python dict (u'Elasticsearch ID', {u'field1': True, u'field2': u'Some Text', u'field3': 12345})
lines = sc.textFile("data.txt")
lineLengths = lines.map(lambda s: len(s))
# Persist the RDD so it can be reused later without being recomputed
lineLengths.persist()
totalLength = lineLengths.reduce(lambda a, b: a + b)
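persist() also accepts an explicit storage level; a minimal sketch (an addition to the original text), assuming the default in-memory level is not enough:

from pyspark import StorageLevel

lineLengths.persist(StorageLevel.MEMORY_AND_DISK)  # spill partitions to disk if they do not fit in memory
# ... run actions on lineLengths ...
lineLengths.unpersist()                             # release the cached data when no longer needed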