Elasticsearch: custom tokenizer split by words and dots
You don't need two different analyzers for this. There's another solution using shingles and it goes this way:
First you need to create an index with the proper analyzer, which I called `domain_shingler`:
PUT dev_threats
{
  "settings": {
    "analysis": {
      "analyzer": {
        "domain_shingler": {
          "type": "custom",
          "tokenizer": "dot_tokenizer",
          "filter": [ "shingles", "joiner" ]
        }
      },
      "tokenizer": {
        "dot_tokenizer": {
          "type": "char_group",
          "tokenize_on_chars": [ "punctuation" ]
        }
      },
      "filter": {
        "shingles": {
          "type": "shingle",
          "min_shingle_size": 2,
          "max_shingle_size": 4,
          "output_unigrams": true
        },
        "joiner": {
          "type": "pattern_replace",
          "pattern": "\\s",
          "replacement": "."
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "domain": {
        "type": "text",
        "analyzer": "domain_shingler",
        "search_analyzer": "standard"
      }
    }
  }
}
If you try to analyze some.test.domain.com
with that analyzer, you'll get the following tokens:
POST dev_threats/_analyze
{
  "analyzer": "domain_shingler",
  "text": "some.test.domain.com"
}
Results:
{ "tokens" : [ { "token" : "some", "start_offset" : 0, "end_offset" : 4, "type" : "word", "position" : 0 }, { "token" : "some.test", "start_offset" : 0, "end_offset" : 9, "type" : "shingle", "position" : 0, "positionLength" : 2 }, { "token" : "some.test.domain", "start_offset" : 0, "end_offset" : 16, "type" : "shingle", "position" : 0, "positionLength" : 3 }, { "token" : "some.test.domain.com", "start_offset" : 0, "end_offset" : 20, "type" : "shingle", "position" : 0, "positionLength" : 4 }, { "token" : "test", "start_offset" : 5, "end_offset" : 9, "type" : "word", "position" : 1 }, { "token" : "test.domain", "start_offset" : 5, "end_offset" : 16, "type" : "shingle", "position" : 1, "positionLength" : 2 }, { "token" : "test.domain.com", "start_offset" : 5, "end_offset" : 20, "type" : "shingle", "position" : 1, "positionLength" : 3 }, { "token" : "domain", "start_offset" : 10, "end_offset" : 16, "type" : "word", "position" : 2 }, { "token" : "domain.com", "start_offset" : 10, "end_offset" : 20, "type" : "shingle", "position" : 2, "positionLength" : 2 }, { "token" : "com", "start_offset" : 17, "end_offset" : 20, "type" : "word", "position" : 3 } ]}
You can use the `path_hierarchy` tokenizer:
PUT my-index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "custom_path_tree": {
          "tokenizer": "custom_hierarchy"
        },
        "custom_path_tree_reversed": {
          "tokenizer": "custom_hierarchy_reversed"
        }
      },
      "tokenizer": {
        "custom_hierarchy": {
          "type": "path_hierarchy",
          "delimiter": "."
        },
        "custom_hierarchy_reversed": {
          "type": "path_hierarchy",
          "delimiter": ".",
          "reverse": true
        }
      }
    }
  }
}

POST my-index/_analyze
{
  "analyzer": "custom_path_tree",
  "text": "some.test.domain.com"
}

POST my-index/_analyze
{
  "analyzer": "custom_path_tree_reversed",
  "text": "some.test.domain.com"
}
**Result**
"tokens" : [ { "token" : "some", "start_offset" : 0, "end_offset" : 4, "type" : "word", "position" : 0 }, { "token" : "some.test", "start_offset" : 0, "end_offset" : 9, "type" : "word", "position" : 0 }, { "token" : "some.test.domain", "start_offset" : 0, "end_offset" : 16, "type" : "word", "position" : 0 }, { "token" : "some.test.domain.com", "start_offset" : 0, "end_offset" : 20, "type" : "word", "position" : 0 } ]}{ "tokens" : [ { "token" : "some.test.domain.com", "start_offset" : 0, "end_offset" : 20, "type" : "word", "position" : 0 }, { "token" : "test.domain.com", "start_offset" : 5, "end_offset" : 20, "type" : "word", "position" : 0 }, { "token" : "domain.com", "start_offset" : 10, "end_offset" : 20, "type" : "word", "position" : 0 }, { "token" : "com", "start_offset" : 17, "end_offset" : 20, "type" : "word", "position" : 0 } ]}
It will create path-like tokens by splitting the input on the given delimiter. Using the normal and reversed analyzers, you can get tokens in both directions.