Diving into MongoDB Aggregation Framework


To inject basics of MongoDB Aggregation framework into our veins lets start with inserting following data:


db.residential_data.insert({“state_name”:”Haryana”,
“building_usage_data”:
[
{“area_type”:”Urban”,”building_usage”:
[
{“usage_type”:”House”,”total”:1663832},
{“usage_type”:”Shop”,”total”:352312 },
{“usage_type”:”Hotel”,”total”:5582 },
{“usage_type”:”Hospital”,”total”:5752},
{“usage_type”:”Temple”,”total”:8442},
{“usage_type”:”Factory”,”total”:3842},
{“usage_type”:”Shopping Mall”,”total”:742},
{“usage_type”:”Cinema Hall”,”total”:642}
]
},
{“area_type”:”Rural”,”building_usage”:
[
{“usage_type”:”House”,”total”:1963832},
{“usage_type”:”Shop”,”total”:302312},
{“usage_type”:”Hospital”,”total”:2752},
{“usage_type”:”Temple”,”total”:9442},
{“usage_type”:”Factory”,”total”:4842},
{“usage_type”:”Resort”,”total”:842}
]
}
]});
db.residential_data.insert({“state_name”:”Punjab”,
“building_usage_data”:
[
{“area_type”:”Urban”,”building_usage”:
[
{“usage_type”:”House”,”total”:1963832},
{“usage_type”:”Shop”,”total”:392312 },
{“usage_type”:”Hotel”,”total”:6582 },
{“usage_type”:”Hospital”,”total”:7752},
{“usage_type”:”Temple”,”total”:9442},
{“usage_type”:”Factory”,”total”:3542},
{“usage_type”:”Shopping Mall”,”total”:772},
{“usage_type”:”Cinema Hall”,”total”:622}
]
},
{“area_type”:”Rural”,”building_usage”:
[
{“usage_type”:”House”,”total”:2063832},
{“usage_type”:”Shop”,”total”:312312},
{“usage_type”:”Hospital”,”total”:3052},
{“usage_type”:”Temple”,”total”:10442},
{“usage_type”:”Factory”,”total”:3842},
{“usage_type”:”Resort”,”total”:1842}
]
}
]});

db.residential_data.insert({“state_name”:”Bihar”,
“building_usage_data”:
[
{“area_type”:”Urban”,”building_usage”:
[
{“usage_type”:”House”,”total”:3063832},
{“usage_type”:”Shop”,”total”:72312 },
{“usage_type”:”Hotel”,”total”:6582 },
{“usage_type”:”Hospital”,”total”:1752},
{“usage_type”:”Temple”,”total”:17442},
{“usage_type”:”Factory”,”total”:8542},
{“usage_type”:”Shopping Mall”,”total”:572},
{“usage_type”:”Cinema Hall”,”total”:722}
]
},
{“area_type”:”Rural”,”building_usage”:
[
{“usage_type”:”House”,”total”:2363832},
{“usage_type”:”Shop”,”total”:10312},
{“usage_type”:”Hospital”,”total”:1052},
{“usage_type”:”Temple”,”total”:13442},
{“usage_type”:”Factory”,”total”:8942},
{“usage_type”:”Resort”,”total”:242}
]
}
]});

db.residential_data.insert({“state_name”:”Himachal”,
“building_usage_data”:
[
{“area_type”:”Urban”,”building_usage”:
[
{“usage_type”:”House”,”total”:1063832},
{“usage_type”:”Shop”,”total”:22312 },
{“usage_type”:”Hotel”,”total”:16582 },
{“usage_type”:”Hospital”,”total”:5752},
{“usage_type”:”Temple”,”total”:10442},
{“usage_type”:”Factory”,”total”:2542},
{“usage_type”:”Shopping Mall”,”total”:502},
{“usage_type”:”Cinema Hall”,”total”:1122}
]
},
{“area_type”:”Rural”,”building_usage”:
[
{“usage_type”:”House”,”total”:363832},
{“usage_type”:”Shop”,”total”:9312},
{“usage_type”:”Hospital”,”total”:1252},
{“usage_type”:”Temple”,”total”:14442},
{“usage_type”:”Factory”,”total”:1942},
{“usage_type”:”Resort”,”total”:3242}
]
}
]});

db.residential_data.insert({“state_name”:”Rajsthan”,
“building_usage_data”:
[
{“area_type”:”Urban”,”building_usage”:
[
{“usage_type”:”House”,”total”:7963832},
{“usage_type”:”Shop”,”total”:172312 },
{“usage_type”:”Hotel”,”total”:26582 },
{“usage_type”:”Hospital”,”total”:19752},
{“usage_type”:”Temple”,”total”:7442},
{“usage_type”:”Factory”,”total”:5542},
{“usage_type”:”Shopping Mall”,”total”:1572},
{“usage_type”:”Cinema Hall”,”total”:2722}
]
},
{“area_type”:”Rural”,”building_usage”:
[
{“usage_type”:”House”,”total”:4363832},
{“usage_type”:”Shop”,”total”:90312},
{“usage_type”:”Hospital”,”total”:11052},
{“usage_type”:”Temple”,”total”:13442},
{“usage_type”:”Factory”,”total”:10942},
{“usage_type”:”Resort”,”total”:2242}
]
}
]});


Aggregation Pipeline

Aggregation pipeline is a series of transformations applied to documents to perform some aggregation tasks and output some cursor or a collection.There can be N numbers of transformation stages where output of first is fed into second , second into third and so on.

aggregation_pipeline

Pipeline operators

Below are the basic pipeline operators which we will use to perform some aggregation tasks over the data which we have created earlier.

  • $match
  • $unwind
  • $group
  • $project
  • $sort
  • $limit
  • $skip
$match

This is similar to SQL’s WHERE clause, to filter some data which is passed on to next stage.For example if for created data we want to perform some aggregation over data that belongs to “Urban” areas then $match operator can be used to filter out that data.


{“$match”:{“building_usage_data.area_type”:”Urban”}}


$unwind

This is used to expand document if it contains some data in the form of arrays.When a $unwind operator is applied to a array data, it will generate a new record for each element of that array.For example, when we run below query:


db.residential_data.aggregate([ {“$unwind”:”$building_usage_data”}]);


output would be something like as shown:

unwind_mongo_aggregation
$group

After flattening our data we can now easily group our data using $group.It is something similar to SQL’s GROUP BY clause.For example to group our data based upon usage_type of building,we should use below query:


db.residential_data.aggregate([
{“$unwind”:”$building_usage_data”},
{“$match”:{“building_usage_data.area_type”:”Urban”}},
{“$unwind”:”$building_usage_data.building_usage”},
{“$group”:{“_id”:{“BuildingType”:”$building_usage_data.building_usage.usage_type”},”Total”:{“$sum”:”$building_usage_data.building_usage.total”}}
}]);


And the output would be:


{ “_id” : { “BuildingType” : “Shopping Mall” }, “Total” : 4160 }
{ “_id” : { “BuildingType” : “Temple” }, “Total” : 53210 }
{ “_id” : { “BuildingType” : “Hospital” }, “Total” : 40760 }
{ “_id” : { “BuildingType” : “Hotel” }, “Total” : 61910 }
{ “_id” : { “BuildingType” : “Factory” }, “Total” : 24010 }
{ “_id” : { “BuildingType” : “Shop” }, “Total” : 1011560 }
{ “_id” : { “BuildingType” : “Cinema Hall” }, “Total” : 5830 }
{ “_id” : { “BuildingType” : “House” }, “Total” : 15719160 }


$project

It is something similar to SELECT clause of SQL.We can use it to select/deselect some field.If field is set to 0, it will not be passed to next stage in pipeline.

For example to select “BuildingType” and “Total” from above queried data, we should use below query:


db.residential_data.aggregate([
{“$unwind”:”$building_usage_data”},
{“$match”:{“building_usage_data.area_type”:”Urban”}},
{“$unwind”:”$building_usage_data.building_usage”},
{“$group”:{“_id”:{“BuildingType”:”$building_usage_data.building_usage.usage_type”},”Total”:{“$sum”:”$building_usage_data.building_usage.total”}}},
{“$project”:{“_id”:0,”BuildingType”:”$_id.BuildingType”,”Total”:”$Total”}}]);


And the output would be:


{ “Total” : 4160, “BuildingType” : “Shopping Mall” }
{ “Total” : 53210, “BuildingType” : “Temple” }
{ “Total” : 40760, “BuildingType” : “Hospital” }
{ “Total” : 61910, “BuildingType” : “Hotel” }
{ “Total” : 24010, “BuildingType” : “Factory” }
{ “Total” : 1011560, “BuildingType” : “Shop” }
{ “Total” : 5830, “BuildingType” : “Cinema Hall” }
{ “Total” : 15719160, “BuildingType” : “House” }


$sort

It is similar to SQL’s ORDER BY clause.To sort in descending order use  -1 and for ascending use 1.

For example when we run below query:


db.residential_data.aggregate([
{“$unwind”:”$building_usage_data”},
{“$match”:{“building_usage_data.area_type”:”Urban”}},
{“$unwind”:”$building_usage_data.building_usage”},
{“$group”:{“_id”:{“BuildingType”:”$building_usage_data.building_usage.usage_type”},”Total”:{“$sum”:”$building_usage_data.building_usage.total”}}},
{“$project”:{“_id”:0,”BuildingType”:”$_id.BuildingType”,”Total”:”$Total”}},
{$sort:{“Total”:-1}}
]);


output would be:


{ “Total” : 15719160, “BuildingType” : “House” }
{ “Total” : 1011560, “BuildingType” : “Shop” }
{ “Total” : 61910, “BuildingType” : “Hotel” }
{ “Total” : 53210, “BuildingType” : “Temple” }
{ “Total” : 40760, “BuildingType” : “Hospital” }
{ “Total” : 24010, “BuildingType” : “Factory” }
{ “Total” : 5830, “BuildingType” : “Cinema Hall” }
{ “Total” : 4160, “BuildingType” : “Shopping Mall” }


$limit and $skip

These are used to limit the number of records being returned.For example when we run below query only 4 records would be returned as we are limiting the number of records to 4.


db.residential_data.aggregate([
{“$unwind”:”$building_usage_data”},
{“$match”:{“building_usage_data.area_type”:”Urban”}},
{“$unwind”:”$building_usage_data.building_usage”},
{“$group”:{“_id”:{“BuildingType”:”$building_usage_data.building_usage.usage_type”},”Total”:{“$sum”:”$building_usage_data.building_usage.total”}}},
{“$project”:{“_id”:0,”BuildingType”:”$_id.BuildingType”,”Total”:”$Total”}},
{$sort:{“Total”:-1}},
{$limit:4}
]);


Output:


{ “Total” : 15719160, “BuildingType” : “House” }
{ “Total” : 1011560, “BuildingType” : “Shop” }
{ “Total” : 61910, “BuildingType” : “Hotel” }
{ “Total” : 53210, “BuildingType” : “Temple” }


Now to skip first 2 records run below query:


db.residential_data.aggregate([
{“$unwind”:”$building_usage_data”},
{“$match”:{“building_usage_data.area_type”:”Urban”}},
{“$unwind”:”$building_usage_data.building_usage”},
{“$group”:{“_id”:{“BuildingType”:”$building_usage_data.building_usage.usage_type”},”Total”:{“$sum”:”$building_usage_data.building_usage.total”}}},
{“$project”:{“_id”:0,”BuildingType”:”$_id.BuildingType”,”Total”:”$Total”}},
{$sort:{“Total”:-1}},
{$limit:4},
{$skip:2}
]);


Output:


{ “Total” : 61910, “BuildingType” : “Hotel” }
{ “Total” : 53210, “BuildingType” : “Temple” }


Advertisements