ElasticSearch group by multiple fields
Starting from version 1.0 of ElasticSearch
, the new aggregations API allows grouping by multiple fields, using sub-aggregations. Suppose you want to group by fields field1
, field2
and field3
:
{ "aggs": { "agg1": { "terms": { "field": "field1" }, "aggs": { "agg2": { "terms": { "field": "field2" }, "aggs": { "agg3": { "terms": { "field": "field3" } } } } } } }}
Of course this can go on for as many fields as you'd like.
Update:
For completeness, here is how the output of the above query looks. Also below is python code for generating the aggregation query and flattening the result into a list of dictionaries.
{ "aggregations": { "agg1": { "buckets": [{ "doc_count": <count>, "key": <value of field1>, "agg2": { "buckets": [{ "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, { "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, ... ] }, { "doc_count": <count>, "key": <value of field1>, "agg2": { "buckets": [{ "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, { "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, ... ] }, ... ] } }}
The following Python code performs the group-by given the list of fields. If you specify include_missing=True
, it also includes combinations of values where some of the fields are missing (you don't need it if you have version 2.0 of Elasticsearch thanks to this)
def group_by(es, fields, include_missing):
    """Group documents by several fields using nested aggregations.

    Builds one ``terms`` aggregation per entry of ``fields``, each nested
    inside the previous one, sends it through ``es.search`` and flattens the
    nested buckets into a list of flat records.

    Args:
        es: Client object exposing ``search(body=...)``; the response must
            be indexable by ``'aggregations'``.
        fields: Non-empty sequence of field names, outermost level first.
        include_missing: When true, a ``missing`` aggregation named
            ``<field>_missing`` is added beside every ``terms`` aggregation.

    Returns:
        A list of dicts as produced by ``get_docs_from_agg_result``.

    Raises:
        IndexError: If ``fields`` is empty.
    """
    current_level_terms = {'terms': {'field': fields[0]}}
    agg_spec = {fields[0]: current_level_terms}

    if include_missing:
        current_level_missing = {'missing': {'field': fields[0]}}
        agg_spec[fields[0] + '_missing'] = current_level_missing

    for field in fields[1:]:
        next_level_terms = {'terms': {'field': field}}
        current_level_terms['aggs'] = {
            field: next_level_terms,
        }

        if include_missing:
            next_level_missing = {'missing': {'field': field}}
            current_level_terms['aggs'][field + '_missing'] = next_level_missing
            # The same child dicts are deliberately shared by the terms and
            # the missing parent: attaching deeper levels to the children on
            # later iterations then extends both branches at once.
            current_level_missing['aggs'] = {
                field: next_level_terms,
                field + '_missing': next_level_missing,
            }
            current_level_missing = next_level_missing

        current_level_terms = next_level_terms

    agg_result = es.search(body={'aggs': agg_spec})['aggregations']
    return get_docs_from_agg_result(agg_result, fields, include_missing)


def get_docs_from_agg_result(agg_result, fields, include_missing):
    """Flatten a nested aggregation result into a list of records.

    ``agg_result`` is left unmodified, so the same response can be flattened
    more than once with identical results.

    Args:
        agg_result: Mapping holding ``agg_result[field]['buckets']`` for the
            first entry of ``fields`` and, when ``include_missing`` is true,
            ``agg_result[field + '_missing']``. Buckets carry the deeper
            levels under the same naming.
        fields: Non-empty sequence of field names, outermost level first.
        include_missing: Whether the ``<field>_missing`` entry is treated as
            one more bucket at every level.

    Returns:
        One dict per innermost bucket whose ``doc_count`` is positive. Each
        maps every field name to its bucket ``key`` (``None`` when the
        bucket has no ``'key'`` entry) and ``'doc_count'`` to the innermost
        bucket's count.

    Raises:
        IndexError: If ``fields`` is empty.
    """
    current_field = fields[0]
    # Copy before appending: the list belongs to the caller's response and
    # appending to it in place would duplicate the missing bucket on reuse.
    buckets = list(agg_result[current_field]['buckets'])
    if include_missing:
        buckets.append(agg_result[current_field + '_missing'])

    if len(fields) == 1:
        return [
            {
                current_field: bucket.get('key'),
                'doc_count': bucket['doc_count'],
            }
            for bucket in buckets
            if bucket['doc_count'] > 0
        ]

    result = []
    for bucket in buckets:
        records = get_docs_from_agg_result(bucket, fields[1:], include_missing)
        value = bucket.get('key')
        # Records are freshly built by the recursive call, so tagging them
        # in place does not touch the input.
        for record in records:
            record[current_field] = value
        result.extend(records)

    return result
As you only have 2 fields a simple way is doing two queries with single facets. For Male:
{ "query" : { "term" : { "gender" : "Male" } }, "facets" : { "age_range" : { "terms" : { "field" : "age_range" } } }}
And for female:
{ "query" : { "term" : { "gender" : "Female" } }, "facets" : { "age_range" : { "terms" : { "field" : "age_range" } } }}
Or you can do it in a single query with a facet filter (see this link for further information)
{ "query" : { "match_all": {} }, "facets" : { "age_range_male" : { "terms" : { "field" : "age_range" }, "facet_filter":{ "term": { "gender": "Male" } } }, "age_range_female" : { "terms" : { "field" : "age_range" }, "facet_filter":{ "term": { "gender": "Female" } } } }}
Update:
As facets are about to be removed, this is the solution with aggregations:
{ "query": { "match_all": {} }, "aggs": { "male": { "filter": { "term": { "gender": "Male" } }, "aggs": { "age_range": { "terms": { "field": "age_range" } } } }, "female": { "filter": { "term": { "gender": "Female" } }, "aggs": { "age_range": { "terms": { "field": "age_range" } } } } }}
The aggregations API allows grouping by multiple fields, using sub-aggregations. Suppose you want to group by fields field1
, field2
and field3
:
{ "aggs": { "agg1": { "terms": { "field": "field1" }, "aggs": { "agg2": { "terms": { "field": "field2" }, "aggs": { "agg3": { "terms": { "field": "field3" } } } } } } }}
Of course this can go on for as many fields as you'd like.
Update:
For completeness, here is how the output of the above query looks. Also below is python code for generating the aggregation query and flattening the result into a list of dictionaries.
{ "aggregations": { "agg1": { "buckets": [{ "doc_count": <count>, "key": <value of field1>, "agg2": { "buckets": [{ "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, { "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, ... ] }, { "doc_count": <count>, "key": <value of field1>, "agg2": { "buckets": [{ "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, { "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, ... ] }, ... ] } }}
The following Python code performs the group-by given the list of fields. If you specify include_missing=True
, it also includes combinations of values where some of the fields are missing (you don't need it if you have version 2.0 of Elasticsearch thanks to this)
def group_by(es, fields, include_missing):
    """Group documents by several fields using nested aggregations.

    Builds one ``terms`` aggregation per entry of ``fields``, each nested
    inside the previous one, sends it through ``es.search`` and flattens the
    nested buckets into a list of flat records.

    Args:
        es: Client object exposing ``search(body=...)``; the response must
            be indexable by ``'aggregations'``.
        fields: Non-empty sequence of field names, outermost level first.
        include_missing: When true, a ``missing`` aggregation named
            ``<field>_missing`` is added beside every ``terms`` aggregation.

    Returns:
        A list of dicts as produced by ``get_docs_from_agg_result``.

    Raises:
        IndexError: If ``fields`` is empty.
    """
    current_level_terms = {'terms': {'field': fields[0]}}
    agg_spec = {fields[0]: current_level_terms}

    if include_missing:
        current_level_missing = {'missing': {'field': fields[0]}}
        agg_spec[fields[0] + '_missing'] = current_level_missing

    for field in fields[1:]:
        next_level_terms = {'terms': {'field': field}}
        current_level_terms['aggs'] = {
            field: next_level_terms,
        }

        if include_missing:
            next_level_missing = {'missing': {'field': field}}
            current_level_terms['aggs'][field + '_missing'] = next_level_missing
            # The same child dicts are deliberately shared by the terms and
            # the missing parent: attaching deeper levels to the children on
            # later iterations then extends both branches at once.
            current_level_missing['aggs'] = {
                field: next_level_terms,
                field + '_missing': next_level_missing,
            }
            current_level_missing = next_level_missing

        current_level_terms = next_level_terms

    agg_result = es.search(body={'aggs': agg_spec})['aggregations']
    return get_docs_from_agg_result(agg_result, fields, include_missing)


def get_docs_from_agg_result(agg_result, fields, include_missing):
    """Flatten a nested aggregation result into a list of records.

    ``agg_result`` is left unmodified, so the same response can be flattened
    more than once with identical results.

    Args:
        agg_result: Mapping holding ``agg_result[field]['buckets']`` for the
            first entry of ``fields`` and, when ``include_missing`` is true,
            ``agg_result[field + '_missing']``. Buckets carry the deeper
            levels under the same naming.
        fields: Non-empty sequence of field names, outermost level first.
        include_missing: Whether the ``<field>_missing`` entry is treated as
            one more bucket at every level.

    Returns:
        One dict per innermost bucket whose ``doc_count`` is positive. Each
        maps every field name to its bucket ``key`` (``None`` when the
        bucket has no ``'key'`` entry) and ``'doc_count'`` to the innermost
        bucket's count.

    Raises:
        IndexError: If ``fields`` is empty.
    """
    current_field = fields[0]
    # Copy before appending: the list belongs to the caller's response and
    # appending to it in place would duplicate the missing bucket on reuse.
    buckets = list(agg_result[current_field]['buckets'])
    if include_missing:
        buckets.append(agg_result[current_field + '_missing'])

    if len(fields) == 1:
        return [
            {
                current_field: bucket.get('key'),
                'doc_count': bucket['doc_count'],
            }
            for bucket in buckets
            if bucket['doc_count'] > 0
        ]

    result = []
    for bucket in buckets:
        records = get_docs_from_agg_result(bucket, fields[1:], include_missing)
        value = bucket.get('key')
        # Records are freshly built by the recursive call, so tagging them
        # in place does not touch the input.
        for record in records:
            record[current_field] = value
        result.extend(records)

    return result