elasticsearch tokenize "H&R Blocks" as "H", "R", "H&R", "Blocks" elasticsearch tokenize "H&R Blocks" as "H", "R", "H&R", "Blocks" elasticsearch elasticsearch

elasticsearch tokenize "H&R Blocks" as "H", "R", "H&R", "Blocks"


Try using the word_delimiter token filter.

According to the docs, you can set the parameter preserve_original: true to do exactly what you want (i.e. "H&R" => "H&R", "H", "R").

I would set it up like this:

"settings" : {    "analysis" : {        "filter" : {            "special_character_spliter" : {                "type" : "word_delimiter",                "preserve_original": "true"            }           },        "analyzer" : {            "your_analyzer" : {                "type" : "custom",                "tokenizer" : "whitespace",                "filter" : ["lowercase", "special_character_spliter"]            }        }    }}

Good luck!


"settings" : {    "analysis" : {       "filter" : {           "blocks_filter" : {               "type" : "word_delimiter",               "preserve_original": "true"           },          "shingle":{              "type":"shingle",              "max_shingle_size":5,              "min_shingle_size":2,              "output_unigrams":"true"           },           "filter_stop":{              "type":"stop",              "enable_position_increments":"false"           }       },       "analyzer" : {           "blocks_analyzer" : {               "type" : "custom",               "tokenizer" : "whitespace",               "filter" : ["lowercase", "blocks_filter", "shingle"]           }       }   }},"mappings" : {   "type" : {       "properties" : {           "company" : {               "type" : "string",               "analyzer" : "blocks_analyzer"           }       }   }}