Download - Map/Confused? A practical approach to Map/Reduce with MongoDB

http://research.google.com/archive/mapreduce.html

http://jira.mongodb.org/browse/SERVER-1197




{

"_id" : ObjectId("4fb9fb91d066d657de8d6f36"),

"text" : “MongoDB uses Map/Reduce #epic #win",

…

"user" : {

"friends_count" : 73,

…

"followers_count" : 102,

"id" : 53507833,

},

…

}

# Shard 1.
mongod --rest --shardsvr --port 27017 --dbpath /tmp/shard1/ --smallfiles

# Shard 2: must use its own port and data directory, otherwise it collides
# with shard 1 and cannot start.
mongod --rest --shardsvr --port 27018 --dbpath /tmp/shard2/ --smallfiles

# Config server.
mongod --configsvr --port 10000 --dbpath /tmp/config/ --smallfiles

# mongos, pointed at the config server started above.
mongos --port 22222 --configdb localhost:10000

1. db.tweets.mapReduce()

2. db.tweets.group()

3. db.tweets.aggregate()

4. MongoDB-Hadoop Adapter

5. db.tweets.find()

/**
 * Runs a callback once and reports its result together with the elapsed time.
 *
 * @param {Function} c - Function to time; it is invoked through apply() with
 *   no explicit receiver and no arguments.
 * @returns {{results: *, duration: number}} The callback's return value and
 *   the difference between two Date.now() readings taken around the call
 *   (milliseconds).
 */
var measure = function(c) {
  var startedAt = Date.now();
  var outcome = c.apply();
  var elapsed = Date.now() - startedAt;
  return { results: outcome, duration: elapsed };
};

function() {

if (this.user != null) {

emit("user",

{userName: this.user.name,

followers: this.user.followers_count});

}

}

function(key, values) {

var result = null;

values.forEach( function(value) {

if (result == null ||

result.followers < value.followers) {

result = value;

}

})

return result;

}

// Single-bucket group (empty key): the accumulator starts at
// { name: '', followers_count: 0 } and is overwritten whenever a document's
// user has a strictly larger followers_count than the one recorded so far.
db.tweets.group({
  key: {},
  initial: { name: '', followers_count: 0 },
  reduce: function(obj, prev) {
    var author = obj.user;
    // Documents without a user never update the accumulator.
    if (author == null) {
      return;
    }
    if (prev.followers_count < author.followers_count) {
      prev.name = author.name;
      prev.followers_count = author.followers_count;
    }
  }
})

// Aggregation pipeline, passed as a single array of stages:
//   1. $group   - one bucket per user name, keeping the maximum followers_count
//   2. $sort    - buckets ordered by followers_count, descending
//   3. $limit   - keep only the first bucket
//   4. $project - drop _id and expose user_name / followers_count at top level
db.tweets.aggregate([
  {
    $group: {
      _id: { user_name: "$user.name" },
      followers_count: { $max: "$user.followers_count" }
    }
  },
  { $sort: { followers_count: -1 } },
  { $limit: 1 },
  {
    $project: {
      _id: 0,
      user_name: "$_id.user_name",
      followers_count: "$followers_count"
    }
  }
])

#!/usr/bin/env python

# encoding: utf-8

import sys

sys.path.append(".")

from pymongo_hadoop import BSONMapper

def mapper(documents):
    """Yield one ``{'_id', 'followers'}`` record per document that has a user.

    Args:
        documents: iterable of mapping-like documents (anything with a
            ``.get`` method -- presumably the BSON documents handed over by
            BSONMapper; confirm against pymongo_hadoop).

    Yields:
        dict: ``'_id'`` is the user's name encoded as UTF-8 and
        ``'followers'`` is the user's ``followers_count``.

    Documents whose ``'user'`` field is missing or ``None`` are skipped,
    matching the ``this.user != null`` guard of the JavaScript map step.
    """
    for doc in documents:
        # .get() treats an absent 'user' key like an explicit None instead of
        # raising KeyError on the first user-less document.
        user = doc.get('user')
        if user is not None:
            yield {'_id': user['name'].encode('utf-8'),
                   'followers': user['followers_count']}

# Hand the mapper generator to pymongo_hadoop's BSONMapper, then report
# completion on stderr.
BSONMapper(mapper)
sys.stderr.write("Done Mapping!\n")

#!/usr/bin/env python

# encoding: utf-8

import sys

sys.path.append('.')

from pymongo_hadoop import BSONReducer

def reducer(key, values):
    """Return the largest ``'followers'`` value seen for ``key``.

    Args:
        key: the grouping key; it must support ``.encode('utf-8')``.
        values: iterable of mappings, each with a ``'followers'`` entry.

    Returns:
        dict: ``{'_id': <key encoded as UTF-8>, 'count': <max followers>}``.
        ``'count'`` is 0 when ``values`` is empty or no value exceeds 0.
    """
    # Progress message goes to stderr. sys.stderr.write is used instead of
    # the Python-2-only ``print >>`` statement; the bytes written are the same.
    sys.stderr.write("Processing key %s\n" % key.encode('utf-8'))
    _count = 0
    for v in values:
        followers = v['followers']
        if _count < followers:
            _count = followers
    return {"_id": key.encode('utf-8'), "count": _count}

# Hand the reducer function to pymongo_hadoop's BSONReducer, then report
# completion on stderr.
BSONReducer(reducer)
sys.stderr.write("Done Reducing!\n")

# Run the streaming job: read twitter.tweets, write twitter.top_user.
# The jar path is a single token, and -files takes a comma-separated list
# with no spaces.
hadoop jar /usr/lib/hadoop/lib/mongo-hadoop-streaming-assembly-1.1.0-SNAPSHOT.jar \
    -files mapper.py,reducer.py \
    -inputURI mongodb://localhost:27017/twitter.tweets \
    -outputURI mongodb://localhost:27017/twitter.top_user \
    -mapper mapper.py \
    -reducer reducer.py

// Plain query: sort all tweets by user.followers_count, descending, and keep
// only the first document.
db.tweets.find().sort( {"user.followers_count": -1} ).limit(1)

db.tweets.mapReduce()

db.tweets.group()

db.tweets.aggregate()

MongoDB-Hadoop Adapter

db.tweets.find()

http://cdn.blog-sap.com/innovation/files/2012/08/MapReduceFunnel.jpg





http://stackoverflow.com/questions/12139149/mapreduce-with-mongodb-really-really-slow-30-hours-vs-20-minutes-in-mysql-for




























http://stackoverflow.com/questions/3947889/mongodb-terrible-mapreduce-performance/3951871








https://jira.mongodb.org/browse/SERVER-1197