ElasticSearch group by multiple fields

group-by elasticsearch faceted-search facet

Starting from version 1.0 of ElasticSearch, the new aggregations API allows grouping by multiple fields, using sub-aggregations. Suppose you want to group by fields field1, field2 and field3:

{  "aggs": {    "agg1": {      "terms": {        "field": "field1"      },      "aggs": {        "agg2": {          "terms": {            "field": "field2"          },          "aggs": {            "agg3": {              "terms": {                "field": "field3"              }            }          }                  }      }    }  }}

Of course this can go on for as many fields as you'd like.

Update:
For completeness, here is how the output of the above query looks. Also below is python code for generating the aggregation query and flattening the result into a list of dictionaries.

{  "aggregations": {    "agg1": {      "buckets": [{        "doc_count": <count>,        "key": <value of field1>,        "agg2": {          "buckets": [{            "doc_count": <count>,            "key": <value of field2>,            "agg3": {              "buckets": [{                "doc_count": <count>,                "key": <value of field3>              },              {                "doc_count": <count>,                "key": <value of field3>              }, ...              ]            },            {            "doc_count": <count>,            "key": <value of field2>,            "agg3": {              "buckets": [{                "doc_count": <count>,                "key": <value of field3>              },              {                "doc_count": <count>,                "key": <value of field3>              }, ...              ]            }, ...          ]        },        {        "doc_count": <count>,        "key": <value of field1>,        "agg2": {          "buckets": [{            "doc_count": <count>,            "key": <value of field2>,            "agg3": {              "buckets": [{                "doc_count": <count>,                "key": <value of field3>              },              {                "doc_count": <count>,                "key": <value of field3>              }, ...              ]            },            {            "doc_count": <count>,            "key": <value of field2>,            "agg3": {              "buckets": [{                "doc_count": <count>,                "key": <value of field3>              },              {                "doc_count": <count>,                "key": <value of field3>              }, ...              ]            }, ...          ]        }, ...      ]    }  }}

The following Python code performs the group-by given the list of fields. If you specify include_missing=True, it also includes combinations of values where some of the fields are missing (you don't need this with version 2.0 or later of Elasticsearch, where the terms aggregation accepts a "missing" parameter that assigns a default value to documents lacking the field).

def group_by(es, fields, include_missing):
    """Group documents by ``fields`` and return one flat record per combination.

    Builds one ``terms`` aggregation per field, each nested inside the
    aggregation of the previous field, sends it through ``es.search`` and
    flattens the nested buckets with ``get_docs_from_agg_result``.

    Args:
        es: Client object exposing ``search(body=...)`` whose response is
            indexable by ``'aggregations'`` (presumably an elasticsearch-py
            client -- confirm against the caller).
        fields: Non-empty sequence of field-name strings, outermost grouping
            first. An empty sequence raises ``IndexError``.
        include_missing: When true, a ``missing`` aggregation named
            ``<field>_missing`` is added next to every ``terms`` aggregation
            so that combinations lacking some of the fields are reported too.

    Returns:
        A list of dicts, each mapping every field name to its bucket key
        (``None`` for a missing-value bucket) plus a ``'doc_count'`` entry.
    """
    current_level_terms = {'terms': {'field': fields[0]}}
    agg_spec = {fields[0]: current_level_terms}
    if include_missing:
        current_level_missing = {'missing': {'field': fields[0]}}
        agg_spec[fields[0] + '_missing'] = current_level_missing

    for field in fields[1:]:
        next_level_terms = {'terms': {'field': field}}
        current_level_terms['aggs'] = {
            field: next_level_terms,
        }
        if include_missing:
            next_level_missing = {'missing': {'field': field}}
            current_level_terms['aggs'][field + '_missing'] = next_level_missing
            # The missing bucket of the parent level needs the very same
            # children as its terms bucket, so the same spec objects are
            # attached under both parents.
            current_level_missing['aggs'] = {
                field: next_level_terms,
                field + '_missing': next_level_missing,
            }
            current_level_missing = next_level_missing
        current_level_terms = next_level_terms

    agg_result = es.search(body={'aggs': agg_spec})['aggregations']
    return get_docs_from_agg_result(agg_result, fields, include_missing)


def get_docs_from_agg_result(agg_result, fields, include_missing):
    """Flatten a nested aggregation result into a list of flat records.

    Args:
        agg_result: Mapping that contains ``fields[0]`` (a dict with a
            ``'buckets'`` list) and, when ``include_missing`` is true,
            ``fields[0] + '_missing'`` (a single bucket dict). It is not
            modified.
        fields: Non-empty sequence of field names; ``fields[0]`` is the level
            held directly in ``agg_result``, the rest are nested in buckets.
        include_missing: Whether the ``<field>_missing`` buckets are present
            and should be included at every level.

    Returns:
        A list of dicts with one entry per field plus ``'doc_count'``.
        Innermost buckets whose ``doc_count`` is 0 are dropped. The key of a
        missing-value bucket is reported as ``None``.
    """
    current_field = fields[0]
    # Copy before extending: appending to the list owned by ``agg_result``
    # would mutate the caller's response, and flattening the same response
    # again would then report every missing-value bucket twice.
    buckets = list(agg_result[current_field]['buckets'])
    if include_missing:
        buckets.append(agg_result[current_field + '_missing'])

    if len(fields) == 1:
        return [
            {
                # A ``missing`` bucket carries no 'key', hence .get().
                current_field: bucket.get('key'),
                'doc_count': bucket['doc_count'],
            }
            for bucket in buckets if bucket['doc_count'] > 0
        ]

    result = []
    for bucket in buckets:
        records = get_docs_from_agg_result(bucket, fields[1:], include_missing)
        value = bucket.get('key')
        # The records are freshly built dicts from the deeper levels, so
        # tagging them with this level's value does not touch the input.
        for record in records:
            record[current_field] = value
        result.extend(records)
    return result

group-by elasticsearch faceted-search facet

As you only have 2 fields a simple way is doing two queries with single facets. For Male:

{    "query" : {      "term" : { "gender" : "Male" }    },    "facets" : {        "age_range" : {            "terms" : {                "field" : "age_range"            }        }    }}

And for female:

{    "query" : {      "term" : { "gender" : "Female" }    },    "facets" : {        "age_range" : {            "terms" : {                "field" : "age_range"            }        }    }}

Or you can do it in a single query with a facet filter (see this link for further information)

{    "query" : {       "match_all": {}    },    "facets" : {        "age_range_male" : {            "terms" : {                "field" : "age_range"            },            "facet_filter":{                "term": {                    "gender": "Male"                }            }        },        "age_range_female" : {            "terms" : {                "field" : "age_range"            },            "facet_filter":{                "term": {                    "gender": "Female"                }            }        }    }}

Update:

As facets are about to be removed, this is the solution with aggregations:

{  "query": {    "match_all": {}  },  "aggs": {    "male": {      "filter": {        "term": {          "gender": "Male"        }      },      "aggs": {        "age_range": {          "terms": {            "field": "age_range"          }        }      }    },    "female": {      "filter": {        "term": {          "gender": "Female"        }      },      "aggs": {        "age_range": {          "terms": {            "field": "age_range"          }        }      }    }  }}

elasticsearch group-by elasticsearch-query

The aggregations API allows grouping by multiple fields, using sub-aggregations. Suppose you want to group by fields field1, field2 and field3:

{  "aggs": {    "agg1": {      "terms": {        "field": "field1"      },      "aggs": {        "agg2": {          "terms": {            "field": "field2"          },          "aggs": {            "agg3": {              "terms": {                "field": "field3"              }            }          }                  }      }    }  }}

Of course this can go on for as many fields as you'd like.

Update:
For completeness, here is how the output of the above query looks. Also below is python code for generating the aggregation query and flattening the result into a list of dictionaries.

{  "aggregations": {    "agg1": {      "buckets": [{        "doc_count": <count>,        "key": <value of field1>,        "agg2": {          "buckets": [{            "doc_count": <count>,            "key": <value of field2>,            "agg3": {              "buckets": [{                "doc_count": <count>,                "key": <value of field3>              },              {                "doc_count": <count>,                "key": <value of field3>              }, ...              ]            },            {            "doc_count": <count>,            "key": <value of field2>,            "agg3": {              "buckets": [{                "doc_count": <count>,                "key": <value of field3>              },              {                "doc_count": <count>,                "key": <value of field3>              }, ...              ]            }, ...          ]        },        {        "doc_count": <count>,        "key": <value of field1>,        "agg2": {          "buckets": [{            "doc_count": <count>,            "key": <value of field2>,            "agg3": {              "buckets": [{                "doc_count": <count>,                "key": <value of field3>              },              {                "doc_count": <count>,                "key": <value of field3>              }, ...              ]            },            {            "doc_count": <count>,            "key": <value of field2>,            "agg3": {              "buckets": [{                "doc_count": <count>,                "key": <value of field3>              },              {                "doc_count": <count>,                "key": <value of field3>              }, ...              ]            }, ...          ]        }, ...      ]    }  }}

def group_by(es, fields, include_missing):
    """Group documents by ``fields`` and return one flat record per combination.

    Builds one ``terms`` aggregation per field, each nested inside the
    aggregation of the previous field, sends it through ``es.search`` and
    flattens the nested buckets with ``get_docs_from_agg_result``.

    Args:
        es: Client object exposing ``search(body=...)`` whose response is
            indexable by ``'aggregations'`` (presumably an elasticsearch-py
            client -- confirm against the caller).
        fields: Non-empty sequence of field-name strings, outermost grouping
            first. An empty sequence raises ``IndexError``.
        include_missing: When true, a ``missing`` aggregation named
            ``<field>_missing`` is added next to every ``terms`` aggregation
            so that combinations lacking some of the fields are reported too.

    Returns:
        A list of dicts, each mapping every field name to its bucket key
        (``None`` for a missing-value bucket) plus a ``'doc_count'`` entry.
    """
    current_level_terms = {'terms': {'field': fields[0]}}
    agg_spec = {fields[0]: current_level_terms}
    if include_missing:
        current_level_missing = {'missing': {'field': fields[0]}}
        agg_spec[fields[0] + '_missing'] = current_level_missing

    for field in fields[1:]:
        next_level_terms = {'terms': {'field': field}}
        current_level_terms['aggs'] = {
            field: next_level_terms,
        }
        if include_missing:
            next_level_missing = {'missing': {'field': field}}
            current_level_terms['aggs'][field + '_missing'] = next_level_missing
            # The missing bucket of the parent level needs the very same
            # children as its terms bucket, so the same spec objects are
            # attached under both parents.
            current_level_missing['aggs'] = {
                field: next_level_terms,
                field + '_missing': next_level_missing,
            }
            current_level_missing = next_level_missing
        current_level_terms = next_level_terms

    agg_result = es.search(body={'aggs': agg_spec})['aggregations']
    return get_docs_from_agg_result(agg_result, fields, include_missing)


def get_docs_from_agg_result(agg_result, fields, include_missing):
    """Flatten a nested aggregation result into a list of flat records.

    Args:
        agg_result: Mapping that contains ``fields[0]`` (a dict with a
            ``'buckets'`` list) and, when ``include_missing`` is true,
            ``fields[0] + '_missing'`` (a single bucket dict). It is not
            modified.
        fields: Non-empty sequence of field names; ``fields[0]`` is the level
            held directly in ``agg_result``, the rest are nested in buckets.
        include_missing: Whether the ``<field>_missing`` buckets are present
            and should be included at every level.

    Returns:
        A list of dicts with one entry per field plus ``'doc_count'``.
        Innermost buckets whose ``doc_count`` is 0 are dropped. The key of a
        missing-value bucket is reported as ``None``.
    """
    current_field = fields[0]
    # Copy before extending: appending to the list owned by ``agg_result``
    # would mutate the caller's response, and flattening the same response
    # again would then report every missing-value bucket twice.
    buckets = list(agg_result[current_field]['buckets'])
    if include_missing:
        buckets.append(agg_result[current_field + '_missing'])

    if len(fields) == 1:
        return [
            {
                # A ``missing`` bucket carries no 'key', hence .get().
                current_field: bucket.get('key'),
                'doc_count': bucket['doc_count'],
            }
            for bucket in buckets if bucket['doc_count'] > 0
        ]

    result = []
    for bucket in buckets:
        records = get_docs_from_agg_result(bucket, fields[1:], include_missing)
        value = bucket.get('key')
        # The records are freshly built dicts from the deeper levels, so
        # tagging them with this level's value does not touch the input.
        for record in records:
            record[current_field] = value
        result.extend(records)
    return result

CodeHunter

ElasticSearch group by multiple fields

Recent Posts

How can I color dots in a xy scatterplot according to column value?

How to update a claim in ASP.NET Identity?

What does {0} mean when initializing an object?

Accessing members of items in a JSONArray with Java

How to log SQL statements in Spring Boot?

Powershell Get-WebSite name parameter is ignored

How to detect scroll to bottom of html element

Java synchronized method

How to test controllers with CodeIgniter?

Detect Visual Composer

Matplotlib: Specify format of floats for tick labels

Rails join a list of strings with commas and "and" before the last