Solution:
Step 1: Create an application with the following code and store it in problem85.py
# Import SparkContext and SparkConf
from pyspark import SparkContext, SparkConf
# Create configuration object and set App name
# Create configuration object and set the application name
conf = SparkConf().setAppName("CCA 175 Problem 85")
sc = SparkContext(conf=conf)
# Load data from HDFS
contentRDD = sc.textFile("Content.txt")
# Keep only non-empty lines
nonempty_lines = contentRDD.filter(lambda x: len(x) > 0)
# Split each line on spaces
words = nonempty_lines.flatMap(lambda x: x.split(' '))
# Do the word count, then sort by count in descending order
wordcounts = words.map(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda x: (x[1], x[0])).sortByKey(False)
for word in wordcounts.collect():
    print(word)
# Save the final data
wordcounts.saveAsTextFile("problem85")
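The map(lambda x: (x[1], x[0])) step swaps each (word, count) pair to (count, word) so that sortByKey(False) can order the results by count in descending order. If you prefer to keep the pairs as (word, count), an equivalent variant uses sortBy instead; this is a minimal sketch assuming the same words RDD as above:

# Alternative (sketch): sort (word, count) pairs by count without swapping the tuple
wordcounts_alt = words.map(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: x + y) \
    .sortBy(lambda pair: pair[1], ascending=False)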
Step 2: Submit this application
spark-submit --master yarn problem85.py
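Once the job finishes, the output is written to the problem85 directory on HDFS as part files. A quick way to verify the results, assuming the default HDFS home directory for your user and the standard part-file naming, is:

hdfs dfs -ls problem85
hdfs dfs -cat problem85/part-00000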