ElasticSearch group by multiple fields ElasticSearch group by multiple fields elasticsearch elasticsearch

ElasticSearch group by multiple fields


Starting from version 1.0 of ElasticSearch, the new aggregations API allows grouping by multiple fields, using sub-aggregations. Suppose you want to group by fields field1, field2 and field3:

{  "aggs": {    "agg1": {      "terms": {        "field": "field1"      },      "aggs": {        "agg2": {          "terms": {            "field": "field2"          },          "aggs": {            "agg3": {              "terms": {                "field": "field3"              }            }          }                  }      }    }  }}

Of course this can go on for as many fields as you'd like.

Update:
For completeness, here is how the output of the above query looks. Also below is python code for generating the aggregation query and flattening the result into a list of dictionaries.

{
  "aggregations": {
    "agg1": {
      "buckets": [
        {
          "doc_count": <count>,
          "key": <value of field1>,
          "agg2": {
            "buckets": [
              {
                "doc_count": <count>,
                "key": <value of field2>,
                "agg3": {
                  "buckets": [
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    },
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    }, ...
                  ]
                }
              },
              {
                "doc_count": <count>,
                "key": <value of field2>,
                "agg3": {
                  "buckets": [
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    },
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    }, ...
                  ]
                }
              }, ...
            ]
          }
        },
        {
          "doc_count": <count>,
          "key": <value of field1>,
          "agg2": {
            "buckets": [
              {
                "doc_count": <count>,
                "key": <value of field2>,
                "agg3": {
                  "buckets": [
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    },
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    }, ...
                  ]
                }
              },
              {
                "doc_count": <count>,
                "key": <value of field2>,
                "agg3": {
                  "buckets": [
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    },
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    }, ...
                  ]
                }
              }, ...
            ]
          }
        }, ...
      ]
    }
  }
}

The following Python code performs the group-by given the list of fields. If you specify include_missing=True, it also includes combinations of values where some of the fields are missing (you don't need this with Elasticsearch 2.0 or later, where the terms aggregation accepts a `missing` parameter).

def group_by(es, fields, include_missing):
    """Group documents by several fields and return one flat row per group.

    Builds one ``terms`` aggregation per entry of ``fields``, each nested
    inside the previous one, sends it through ``es.search`` and flattens the
    nested buckets with ``get_docs_from_agg_result``.

    Args:
        es: Object whose ``search(body=...)`` returns a mapping containing
            an ``'aggregations'`` key (presumably an Elasticsearch client;
            verify against caller).
        fields: Non-empty sequence of field names, outermost level first.
        include_missing: When true, a ``missing`` aggregation named
            ``<field>_missing`` is added beside every ``terms`` aggregation
            so documents lacking that field are counted as well.

    Returns:
        A list of dicts as produced by ``get_docs_from_agg_result``.

    Raises:
        IndexError: If ``fields`` is empty.
    """
    current_level_terms = {'terms': {'field': fields[0]}}
    agg_spec = {fields[0]: current_level_terms}
    if include_missing:
        current_level_missing = {'missing': {'field': fields[0]}}
        agg_spec[fields[0] + '_missing'] = current_level_missing

    for field in fields[1:]:
        next_level_terms = {'terms': {'field': field}}
        current_level_terms['aggs'] = {
            field: next_level_terms,
        }
        if include_missing:
            next_level_missing = {'missing': {'field': field}}
            current_level_terms['aggs'][field + '_missing'] = next_level_missing
            # The "missing" branch of the current level gets the very same
            # child dicts as the "terms" branch, so whatever is attached to
            # them on the next iteration shows up under both parents.
            current_level_missing['aggs'] = {
                field: next_level_terms,
                field + '_missing': next_level_missing,
            }
            current_level_missing = next_level_missing
        current_level_terms = next_level_terms

    agg_result = es.search(body={'aggs': agg_spec})['aggregations']
    return get_docs_from_agg_result(agg_result, fields, include_missing)


def get_docs_from_agg_result(agg_result, fields, include_missing):
    """Flatten nested aggregation buckets into a list of dictionaries.

    Args:
        agg_result: Mapping that holds ``agg_result[field]['buckets']`` for
            the first entry of ``fields`` and, when ``include_missing`` is
            true, ``agg_result[field + '_missing']``. Every bucket must have
            a ``'doc_count'`` and, for all but the last field, the same
            structure for the next field. This argument is not modified.
        fields: Non-empty sequence of field names, outermost level first.
        include_missing: Whether the ``<field>_missing`` entries are read in
            addition to the regular buckets.

    Returns:
        A list of dicts. Each dict maps every field name to the bucket's
        ``'key'`` (``None`` when the bucket has no ``'key'``, as is the case
        for the ``_missing`` entries used here) and ``'doc_count'`` to the
        innermost bucket's count. Innermost buckets whose count is zero are
        left out.

    Raises:
        IndexError: If ``fields`` is empty.
        KeyError: If an expected aggregation entry is absent.
    """
    current_field = fields[0]
    # Work on a copy: appending the "missing" bucket to the original list
    # would alter the caller's response and duplicate rows on a second call.
    buckets = list(agg_result[current_field]['buckets'])
    if include_missing:
        buckets.append(agg_result[current_field + '_missing'])

    if len(fields) == 1:
        return [
            {
                current_field: bucket.get('key'),
                'doc_count': bucket['doc_count'],
            }
            for bucket in buckets if bucket['doc_count'] > 0
        ]

    remaining_fields = fields[1:]
    result = []
    for bucket in buckets:
        value = bucket.get('key')
        records = get_docs_from_agg_result(
            bucket, remaining_fields, include_missing)
        for record in records:
            record[current_field] = value
        result.extend(records)
    return result


As you only have 2 fields a simple way is doing two queries with single facets. For Male:

{    "query" : {      "term" : { "gender" : "Male" }    },    "facets" : {        "age_range" : {            "terms" : {                "field" : "age_range"            }        }    }}

And for female:

{    "query" : {      "term" : { "gender" : "Female" }    },    "facets" : {        "age_range" : {            "terms" : {                "field" : "age_range"            }        }    }}

Or you can do it in a single query with a facet filter (see this link for further information)

{    "query" : {       "match_all": {}    },    "facets" : {        "age_range_male" : {            "terms" : {                "field" : "age_range"            },            "facet_filter":{                "term": {                    "gender": "Male"                }            }        },        "age_range_female" : {            "terms" : {                "field" : "age_range"            },            "facet_filter":{                "term": {                    "gender": "Female"                }            }        }    }}

Update:

As facets are about to be removed, this is the solution with aggregations:

{  "query": {    "match_all": {}  },  "aggs": {    "male": {      "filter": {        "term": {          "gender": "Male"        }      },      "aggs": {        "age_range": {          "terms": {            "field": "age_range"          }        }      }    },    "female": {      "filter": {        "term": {          "gender": "Female"        }      },      "aggs": {        "age_range": {          "terms": {            "field": "age_range"          }        }      }    }  }}


The aggregations API allows grouping by multiple fields, using sub-aggregations. Suppose you want to group by fields field1, field2 and field3:

{  "aggs": {    "agg1": {      "terms": {        "field": "field1"      },      "aggs": {        "agg2": {          "terms": {            "field": "field2"          },          "aggs": {            "agg3": {              "terms": {                "field": "field3"              }            }          }                  }      }    }  }}

Of course this can go on for as many fields as you'd like.

Update:
For completeness, here is how the output of the above query looks. Also below is python code for generating the aggregation query and flattening the result into a list of dictionaries.

{
  "aggregations": {
    "agg1": {
      "buckets": [
        {
          "doc_count": <count>,
          "key": <value of field1>,
          "agg2": {
            "buckets": [
              {
                "doc_count": <count>,
                "key": <value of field2>,
                "agg3": {
                  "buckets": [
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    },
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    }, ...
                  ]
                }
              },
              {
                "doc_count": <count>,
                "key": <value of field2>,
                "agg3": {
                  "buckets": [
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    },
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    }, ...
                  ]
                }
              }, ...
            ]
          }
        },
        {
          "doc_count": <count>,
          "key": <value of field1>,
          "agg2": {
            "buckets": [
              {
                "doc_count": <count>,
                "key": <value of field2>,
                "agg3": {
                  "buckets": [
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    },
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    }, ...
                  ]
                }
              },
              {
                "doc_count": <count>,
                "key": <value of field2>,
                "agg3": {
                  "buckets": [
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    },
                    {
                      "doc_count": <count>,
                      "key": <value of field3>
                    }, ...
                  ]
                }
              }, ...
            ]
          }
        }, ...
      ]
    }
  }
}

The following Python code performs the group-by given the list of fields. If you specify include_missing=True, it also includes combinations of values where some of the fields are missing (you don't need this with Elasticsearch 2.0 or later, where the terms aggregation accepts a `missing` parameter).

def group_by(es, fields, include_missing):
    """Run a nested terms aggregation over ``fields`` and flatten the result.

    One ``terms`` aggregation is created per field, each placed under the
    ``'aggs'`` key of the previous one. The request is sent with
    ``es.search`` and the response is flattened by
    ``get_docs_from_agg_result``.

    Args:
        es: Object exposing ``search(body=...)`` whose return value has an
            ``'aggregations'`` key (presumably an Elasticsearch client;
            verify against caller).
        fields: Non-empty sequence of field names, outermost level first.
        include_missing: When true, every level also gets a ``missing``
            aggregation named ``<field>_missing`` so that documents without
            the field are counted too.

    Returns:
        A list of dicts as produced by ``get_docs_from_agg_result``.

    Raises:
        IndexError: If ``fields`` is empty.
    """
    current_level_terms = {'terms': {'field': fields[0]}}
    agg_spec = {fields[0]: current_level_terms}
    if include_missing:
        current_level_missing = {'missing': {'field': fields[0]}}
        agg_spec[fields[0] + '_missing'] = current_level_missing

    for field in fields[1:]:
        next_level_terms = {'terms': {'field': field}}
        current_level_terms['aggs'] = {
            field: next_level_terms,
        }
        if include_missing:
            next_level_missing = {'missing': {'field': field}}
            current_level_terms['aggs'][field + '_missing'] = next_level_missing
            # Both parents (terms and missing) reference the same child
            # dicts, so deeper levels attached later appear under each.
            current_level_missing['aggs'] = {
                field: next_level_terms,
                field + '_missing': next_level_missing,
            }
            current_level_missing = next_level_missing
        current_level_terms = next_level_terms

    agg_result = es.search(body={'aggs': agg_spec})['aggregations']
    return get_docs_from_agg_result(agg_result, fields, include_missing)


def get_docs_from_agg_result(agg_result, fields, include_missing):
    """Turn nested aggregation buckets into a flat list of dictionaries.

    Args:
        agg_result: Mapping providing ``agg_result[field]['buckets']`` for
            the first entry of ``fields`` and, when ``include_missing`` is
            true, ``agg_result[field + '_missing']``. Buckets carry a
            ``'doc_count'`` and, for every field but the last, the same
            layout for the following field. The mapping is left untouched.
        fields: Non-empty sequence of field names, outermost level first.
        include_missing: Whether the ``<field>_missing`` entries are
            processed alongside the regular buckets.

    Returns:
        A list of dicts, each mapping every field name to its bucket
        ``'key'`` (``None`` for a bucket without ``'key'``) plus
        ``'doc_count'`` taken from the innermost bucket. Innermost buckets
        with a zero count are skipped.

    Raises:
        IndexError: If ``fields`` is empty.
        KeyError: If an expected aggregation entry is absent.
    """
    current_field = fields[0]
    # Copy before appending so the caller's bucket list is never modified;
    # otherwise a repeated call would see the missing bucket twice.
    buckets = list(agg_result[current_field]['buckets'])
    if include_missing:
        buckets.append(agg_result[current_field + '_missing'])

    if len(fields) == 1:
        return [
            {
                current_field: bucket.get('key'),
                'doc_count': bucket['doc_count'],
            }
            for bucket in buckets if bucket['doc_count'] > 0
        ]

    remaining_fields = fields[1:]
    result = []
    for bucket in buckets:
        value = bucket.get('key')
        records = get_docs_from_agg_result(
            bucket, remaining_fields, include_missing)
        for record in records:
            record[current_field] = value
        result.extend(records)
    return result