import sys
# Word-count mapper: every input line, stripped of surrounding whitespace, is
# treated as one word and emitted as "<word><TAB>1" for the reducer to sum.
for raw_line in sys.stdin:
    token = raw_line.strip()
    print('\t'.join((token, '1')))
reducer.py
import sys
def sum_counts(lines):
    """Yield one "<word><TAB><total>" string per run of identical words.

    *lines* is an iterable of "<word><TAB><count>" records in which all
    records for the same word are adjacent (i.e. the input is sorted by
    word).  Lines without a tab separator, including blank lines, are
    skipped.  A non-integer count raises ValueError.
    """
    cur_word = None
    total = 0  # named `total` so the builtin sum() is not shadowed
    for line in lines:
        # Only drop the line terminator: a full strip() would also remove
        # the leading tab of a record whose word is empty ("\t1").
        record = line.rstrip('\r\n')
        word, sep, val = record.rpartition('\t')
        if not sep:
            continue
        if cur_word is not None and word != cur_word:
            # The key changed: flush the finished word and start over.
            yield '%s\t%s' % (cur_word, total)
            total = 0
        cur_word = word
        total += int(val)
    # Flush the last word; nothing is emitted when there was no input.
    if cur_word is not None:
        yield '%s\t%s' % (cur_word, total)


if __name__ == '__main__':
    for out_line in sum_counts(sys.stdin):
        print(out_line)
import sys
# Projection mapper: split each stripped line on tabs and emit the second and
# third columns as "<col2><TAB><col3>".
# NOTE(review): presumably the input is "user<TAB>item<TAB>score" - confirm.
for raw_line in sys.stdin:
    columns = raw_line.strip().split('\t')
    print('\t'.join((columns[1], columns[2])))
reduce.py
import sys
def average_scores(lines):
    """Yield "<item><TAB><average>" for every distinct item in *lines*.

    *lines* is an iterable of "<item><TAB><integer score>" records; they do
    not need to be grouped by item.  Blank lines are skipped.  A record that
    does not have exactly two tab-separated fields, or whose score is not an
    integer, raises ValueError.
    """
    # item -> [sum of scores, number of scores]; a running sum and count is
    # enough for the mean, so individual scores are not kept.
    totals = {}
    for line in lines:
        record = line.strip()
        if not record:
            continue
        item, score = record.split('\t')
        entry = totals.setdefault(item, [0, 0])
        entry[0] += int(score)
        entry[1] += 1
    for item, (score_sum, count) in totals.items():
        # "* 1.0" forces float division on Python 2 as well.
        yield '%s\t%s' % (item, score_sum * 1.0 / count)


if __name__ == '__main__':
    for out_line in average_scores(sys.stdin):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(out_line)
import sys
def tag_record(line):
    """Convert one space-separated input line to the unified 5-column record.

    The result is "user<TAB>item<TAB>score<TAB>item_name<TAB>item_time",
    where columns that do not apply to the record hold the placeholder "-1":

    * exactly 4 fields -> rating record; the first three fields are taken
      as user, item and score (the fourth field is not used);
    * 3 or more fields otherwise -> item record; the first three fields are
      taken as item, item_name and item_time.

    Lines with fewer than 3 fields (e.g. blank lines) return None.

    NOTE(review): the record type is decided purely by field count, so an
    item whose name contains a space would be misread - confirm the input
    files never contain such names.
    """
    fields = line.strip().split(" ")
    user = item = score = item_name = item_time = "-1"
    if len(fields) == 4:
        user, item, score = fields[0], fields[1], fields[2]
    elif len(fields) >= 3:
        # The third field is the item time; it must land in item_time so
        # that it actually reaches the output record.
        item, item_name, item_time = fields[0], fields[1], fields[2]
    else:
        return None
    return '%s\t%s\t%s\t%s\t%s' % (user, item, score, item_name, item_time)


if __name__ == '__main__':
    for input_line in sys.stdin:
        tagged = tag_record(input_line)
        if tagged is not None:
            print(tagged)
reducer.py
import sys
def join_ratings(lines):
    """Join rating records with item records and yield the combined rows.

    *lines* is an iterable of 5-column, tab-separated records
    "user, item, score, item_name, item_time".  A record whose user column
    is "-1" is an item record and supplies item_name/item_time for its item;
    every other record is a rating.

    One "user<TAB>item<TAB>score<TAB>item_name<TAB>item_time" string is
    yielded per rating, in input order.  Ratings whose item has no item
    record keep "-1" in the name and time columns.  Blank lines are skipped;
    a record without exactly five fields raises ValueError.
    """
    item_dict = {}  # item -> (item_name, item_time)
    ratings = []    # every (user, item, score), so users may rate many items
    for line in lines:
        record = line.strip()
        if not record:
            continue
        user, item, score, item_name, item_time = record.split('\t')
        if user == "-1":
            item_dict[item] = (item_name, item_time)
        else:
            ratings.append((user, item, score))
    # Ratings are resolved only after all input is read, because an item
    # record may arrive after the ratings that refer to it.
    for user, item, score in ratings:
        item_name, item_time = item_dict.get(item, ("-1", "-1"))
        yield '%s\t%s\t%s\t%s\t%s' % (user, item, score, item_name, item_time)


if __name__ == '__main__':
    for out_line in join_ratings(sys.stdin):
        print(out_line)
使用 MapReduce（mr）实现去重任务。
3.1 准备数据
1
2
3
4
5
6
1
2
3
3
其他步骤同上。
3.2 map/reduce/run
mapper.py
import sys
def dedup_pairs(lines):
    """Yield "<key><TAB> " for every non-blank line in *lines*.

    The key is the line with surrounding whitespace removed.  Stripping
    matters: each raw line still carries its trailing newline, which would
    otherwise end up inside the record and split it across two output lines.
    The value is a single space placeholder; only the key is meaningful.
    """
    for line in lines:
        key = line.strip()
        if key:
            yield key + '\t' + ' '


if __name__ == '__main__':
    for out_line in dedup_pairs(sys.stdin):
        print(out_line)
reducer.py
import sys
def unique_keys(lines):
    """Yield each distinct key once from key-sorted "<key><TAB>..." lines.

    The key is the text before the first tab, with surrounding whitespace
    removed.  A key is emitted the first time it differs from the key of the
    previous record, so identical keys must be adjacent in *lines* for the
    output to be duplicate-free.  Records with an empty key are ignored, and
    empty input yields nothing.
    """
    last_key = None
    for line in lines:
        this_key = line.split('\t')[0].strip()
        if not this_key or this_key == last_key:
            continue
        last_key = this_key
        yield this_key


if __name__ == '__main__':
    for key in unique_keys(sys.stdin):
        print(key)
使用 MapReduce（mr）实现排序。
4.1 数据：使用上述计算的电影平均分作为输入
4.2 编写map/reduce/run
import sys
# Identity mapper: echo every input record with surrounding whitespace removed.
for record in (raw_line.strip() for raw_line in sys.stdin):
    print('{0}'.format(record))
import sys
# Identity reducer: write each incoming record back out, stripped of
# surrounding whitespace, in the order it is received.
for record in (raw_line.strip() for raw_line in sys.stdin):
    print("{0}".format(record))
import os
import sys
def index_words(lines, docname):
    """Yield "<word><TAB><docname>" for every word in *lines*.

    Words are separated by any run of whitespace, so repeated spaces and
    blank lines produce no empty words, and a word can never contain the tab
    that delimits the output record.
    """
    for line in lines:
        for word in line.split():
            yield '{0}\t{1}'.format(word, docname)


if __name__ == '__main__':
    # Name of the file this map task is reading.  Newer Hadoop versions
    # export it as "mapreduce_map_input_file"; "map_input_file" is the older
    # name.  A KeyError is still raised when neither variable is set.
    docname = os.environ.get("mapreduce_map_input_file") or os.environ["map_input_file"]
    for out_line in index_words(sys.stdin, docname):
        print(out_line)
reducer.py
import sys
def build_index(lines):
    """Yield "<word><TAB><doc1>,<doc2>,..." for every distinct word.

    *lines* is an iterable of "<word><TAB><docname>" records.  Each document
    is listed once per word, in the order it was first seen for that word.
    Blank lines are skipped; a record without exactly two tab-separated
    fields raises ValueError.
    """
    word_doc_dict = {}  # word -> list of distinct document names
    seen = set()        # (word, docname) pairs already recorded
    for line in lines:
        record = line.strip()
        if not record:
            continue
        word, docname = record.split('\t')
        docs = word_doc_dict.setdefault(word, [])
        if (word, docname) not in seen:
            seen.add((word, docname))
            # Document names are kept as strings so they can be joined.
            docs.append(docname)
    for word, docs in word_doc_dict.items():
        yield '{0}\t{1}'.format(word, ','.join(docs))


if __name__ == '__main__':
    for out_line in build_index(sys.stdin):
        print(out_line)