Elasticsearch highlight matches in HTML without breaking syntax Elasticsearch highlight matches in HTML without breaking syntax elasticsearch elasticsearch

Elasticsearch highlight matches in HTML without breaking syntax


One way to achieve this is to use the html_strip char filter when analyzing the preview_html field.
This ensures that matches do not occur on HTML markup, and hence highlighting ignores the markup too, as shown in the example below.

Example:

put test
{
  "settings": {
    "index": {
      "analysis": {
        "char_filter": {
          "my_html": {
            "type": "html_strip"
          }
        },
        "analyzer": {
          "my_html": {
            "tokenizer": "standard",
            "char_filter": [
              "my_html"
            ],
            "type": "custom"
          }
        }
      }
    }
  }
}

put test/test/_mapping
{
  "properties": {
    "preview_html": {
      "type": "string",
      "analyzer": "my_html",
      "search_analyzer": "standard"
    }
  }
}

put test/test/1
{
  "preview_html": "<p> p </p>"
}

post test/test/_search
{
  "query": {
    "match": {
      "preview_html": "p"
    }
  },
  "highlight": {
    "fields": {
      "preview_html": {}
    }
  }
}

Results

 "hits": [         {            "_index": "test",            "_type": "test",            "_id": "1",            "_score": 0.30685282,            "_source": {               "preview_html": "<p> p </p>"            },            "highlight": {               "preview_html": [                  "<p> <em>p</em> </p>"               ]            }         }      ]