Elasticsearch: custom tokenizer split by words and dots Elasticsearch: custom tokenizer split by words and dots elasticsearch elasticsearch

Elasticsearch: custom tokenizer split by words and dots


You don't need two different analyzers for this. There's another solution using shingles and it goes this way:

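First, create an index with a custom analyzer, which I called domain_shingler. Its tokenizer splits the input on punctuation, the shingle filter joins 2 to 4 adjacent tokens (keeping the single tokens as well), and the joiner filter replaces the whitespace between the joined tokens with a dot: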

PUT dev_threats
{  "settings": {    "analysis": {      "analyzer": {        "domain_shingler": {          "type": "custom",          "tokenizer": "dot_tokenizer",          "filter": [            "shingles",            "joiner"          ]        }      },      "tokenizer": {        "dot_tokenizer": {          "type": "char_group",          "tokenize_on_chars": [            "punctuation"          ]        }      },      "filter": {        "shingles": {          "type": "shingle",          "min_shingle_size": 2,          "max_shingle_size": 4,          "output_unigrams": true        },        "joiner": {          "type": "pattern_replace",          "pattern": """\s""",          "replacement": "."        }      }    }  },  "mappings": {    "properties": {      "domain": {        "type": "text",        "analyzer": "domain_shingler",        "search_analyzer": "standard"      }    }  }}

If you try to analyze some.test.domain.com with that analyzer, you'll get the following tokens:

POST dev_threats/_analyze
{  "analyzer": "domain_shingler",  "text": "some.test.domain.com"}

Results:

{  "tokens" : [    {      "token" : "some",      "start_offset" : 0,      "end_offset" : 4,      "type" : "word",      "position" : 0    },    {      "token" : "some.test",      "start_offset" : 0,      "end_offset" : 9,      "type" : "shingle",      "position" : 0,      "positionLength" : 2    },    {      "token" : "some.test.domain",      "start_offset" : 0,      "end_offset" : 16,      "type" : "shingle",      "position" : 0,      "positionLength" : 3    },    {      "token" : "some.test.domain.com",      "start_offset" : 0,      "end_offset" : 20,      "type" : "shingle",      "position" : 0,      "positionLength" : 4    },    {      "token" : "test",      "start_offset" : 5,      "end_offset" : 9,      "type" : "word",      "position" : 1    },    {      "token" : "test.domain",      "start_offset" : 5,      "end_offset" : 16,      "type" : "shingle",      "position" : 1,      "positionLength" : 2    },    {      "token" : "test.domain.com",      "start_offset" : 5,      "end_offset" : 20,      "type" : "shingle",      "position" : 1,      "positionLength" : 3    },    {      "token" : "domain",      "start_offset" : 10,      "end_offset" : 16,      "type" : "word",      "position" : 2    },    {      "token" : "domain.com",      "start_offset" : 10,      "end_offset" : 20,      "type" : "shingle",      "position" : 2,      "positionLength" : 2    },    {      "token" : "com",      "start_offset" : 17,      "end_offset" : 20,      "type" : "word",      "position" : 3    }  ]}


You can use the path_hierarchy tokenizer:

PUT my-index
{  "settings": {    "analysis": {      "analyzer": {        "custom_path_tree": {          "tokenizer": "custom_hierarchy"        },        "custom_path_tree_reversed": {          "tokenizer": "custom_hierarchy_reversed"        }      },      "tokenizer": {        "custom_hierarchy": {          "type": "path_hierarchy",          "delimiter": "."        },        "custom_hierarchy_reversed": {          "type": "path_hierarchy",          "delimiter": ".",          "reverse": "true"        }      }    }  }}

POST my-index/_analyze
{  "analyzer": "custom_path_tree",  "text": "some.test.domain.com"}

POST my-index/_analyze
{  "analyzer": "custom_path_tree_reversed",  "text": "some.test.domain.com"}

**Result**

{  "tokens" : [    {      "token" : "some",      "start_offset" : 0,      "end_offset" : 4,      "type" : "word",      "position" : 0    },    {      "token" : "some.test",      "start_offset" : 0,      "end_offset" : 9,      "type" : "word",      "position" : 0    },    {      "token" : "some.test.domain",      "start_offset" : 0,      "end_offset" : 16,      "type" : "word",      "position" : 0    },    {      "token" : "some.test.domain.com",      "start_offset" : 0,      "end_offset" : 20,      "type" : "word",      "position" : 0    }  ]}

{  "tokens" : [    {      "token" : "some.test.domain.com",      "start_offset" : 0,      "end_offset" : 20,      "type" : "word",      "position" : 0    },    {      "token" : "test.domain.com",      "start_offset" : 5,      "end_offset" : 20,      "type" : "word",      "position" : 0    },    {      "token" : "domain.com",      "start_offset" : 10,      "end_offset" : 20,      "type" : "word",      "position" : 0    },    {      "token" : "com",      "start_offset" : 17,      "end_offset" : 20,      "type" : "word",      "position" : 0    }  ]}

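It creates path-like tokens by splitting on the given delimiter. By combining the normal and the reversed variants, you get tokens in both directions: prefixes that grow from the left (some, some.test, ...) and suffixes that shrink toward the right (..., domain.com, com).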