Elasticsearch: tokenize "H&R Blocks" as "H", "R", "H&R", "Blocks"
Try using the word_delimiter
token filter.
Reading the docs on its use, you can set the parameter preserve_original: true
to do exactly what you want (i.e. "H&R" => "H&R", "H", "R").
I would set it up like this:
{
  "settings": {
    "analysis": {
      "filter": {
        "special_character_spliter": {
          "type": "word_delimiter",
          "preserve_original": true
        }
      },
      "analyzer": {
        "your_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": ["lowercase", "special_character_spliter"]
        }
      }
    }
  }
}
Good luck!
{
  "settings": {
    "analysis": {
      "filter": {
        "blocks_filter": {
          "type": "word_delimiter",
          "preserve_original": true
        },
        "shingle": {
          "type": "shingle",
          "max_shingle_size": 5,
          "min_shingle_size": 2,
          "output_unigrams": true
        },
        "filter_stop": {
          "type": "stop",
          "enable_position_increments": false
        }
      },
      "analyzer": {
        "blocks_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": ["lowercase", "blocks_filter", "shingle"]
        }
      }
    }
  },
  "mappings": {
    "type": {
      "properties": {
        "company": {
          "type": "string",
          "analyzer": "blocks_analyzer"
        }
      }
    }
  }
}