Uploaded image for project: 'Ibexa IBX'
  1. Ibexa IBX
  2. IBX-8104

Document how to configure Elasticsearch with analyzers for different languages

Details

    • Yes

    Description

      There is some documentation about how to customize Elasticsearch templates in the docs, but it does not mention how to configure stemming and language-specific analysis in a multi-language setup.

      The typical use-cases should be covered:

      • stemming
      • stop-words

       

       

       

      An example of how to configure English and Norwegian:

      # config/packages/ibexa_elasticsearch.yaml
      # Base configuration for Elasticsearch.
      # Exposes the Elasticsearch address as the "elasticsearch_dsn" container
      # parameter, which the connection's "hosts" list references.
      parameters:
          # NOTE(review): presumably Symfony's "env(NAME)" default-value
          # convention, i.e. ELASTICSEARCH_URL falls back to ELASTICSEARCH_DSN
          # when it is not set - confirm.
          env(ELASTICSEARCH_URL): '%env(ELASTICSEARCH_DSN)%'
          elasticsearch_dsn: "%env(ELASTICSEARCH_URL)%"
      
      # Bundle configuration: one "default" connection, a document group
      # resolver, and the per-language index templates.
      ibexa_elasticsearch:
          connections:
              default:
                  hosts:
                      - "%elasticsearch_dsn%"
                  # Each name listed here must match a key defined under
                  # ibexa_elasticsearch.index_templates.
                  index_templates:
                      - norwegian
                      - english
                  # NOTE(review): debug/trace look like development-time
                  # diagnostics - presumably they should be switched off in
                  # production; confirm against the bundle documentation.
                  debug: true
                  trace: true
      
          # NOTE(review): presumably this resolver puts each language's
          # documents into its own index, which is what lets the "*_nor_no*"
          # and "*_eng_gb*" patterns select a template - confirm.
          document_group_resolver: 'Ibexa\Elasticsearch\ElasticSearch\Index\Group\LanguageGroupResolver'
      
          # One template per language; each repeats the same field mappings and
          # differs only in its index pattern and analysis settings.
          index_templates:
              # Index template for Norwegian content: applied to every index
              # whose name matches "*_nor_no*". Adds Norwegian stop-word
              # filtering and stemming to the two spellcheck analyzers.
              norwegian:
                  # "order" not supported:
                  #order: 10
                  patterns:
                      - "*_nor_no*"
                  settings:
                      analysis:
                          normalizer:
                              # Lowercases keyword values; referenced by the
                              # "*_s" and "*_ms" dynamic templates below.
                              lowercase_normalizer:
                                  type: custom
                                  char_filter: []
                                  filter:
                                      - lowercase
                          analyzer:
                              # Analyzer for the "*_fulltext", "*_t" and
                              # "*_spellcheck" fields. The filter order is
                              # significant: stop words are dropped and keywords
                              # marked before stemming, and shingles are built
                              # last from the stemmed tokens.
                              ibexa_spellcheck_analyzer:
                                  type: custom
                                  tokenizer: lowercase
                                  filter:
                                      - lowercase
                                      - norwegian_stop
                                      - norwegian_keywords
                                      - norwegian_stemmer
                                      - ibexa_spellcheck_shingle_filter
                              # Analyzer for the "raw" sub-field of
                              # "*_spellcheck": same language filters, but with
                              # the standard tokenizer and no shingles.
                              ibexa_spellcheck_raw_analyzer:
                                  type: custom
                                  tokenizer: standard
                                  filter:
                                      - lowercase
                                      - norwegian_stop
                                      - norwegian_keywords
                                      - norwegian_stemmer
                          filter:
                              # Emits 2- and 3-word shingles.
                              ibexa_spellcheck_shingle_filter:
                                  type: shingle
                                  min_shingle_size: 2
                                  max_shingle_size: 3
                              # "_norwegian_" selects Elasticsearch's predefined
                              # Norwegian stop-word list.
                              norwegian_stop:
                                  type: stop
                                  stopwords: "_norwegian_"
                              # Words listed here are excluded from stemming;
                              # the list is empty in this example.
                              norwegian_keywords:
                                  type: keyword_marker
                                  keywords: []
                              norwegian_stemmer:
                                  type: stemmer
                                  # Available stemmer languages:
                                  # https://www.elastic.co/guide/en/elasticsearch/reference/7.17/analysis-stemmer-tokenfilter.html#analysis-stemmer-tokenfilter-configure-parms
                                  # Alternative stemmer, left disabled:
                                  # language: norwegian
                                  language: light_norwegian
                      # NOTE(review): "-1" disables periodic index refresh in
                      # Elasticsearch - presumably refreshes are triggered
                      # explicitly by the indexer; confirm.
                      refresh_interval: "-1"
                      index:
                          mapping:
                              total_fields:
                                  limit: 5000
                  mappings:
                      # Maps field-name suffixes to Elasticsearch field types.
                      # Keep the entry order unchanged: dynamic templates are
                      # evaluated in the order they are listed.
                      dynamic_templates:
                          -   ez_int:
                                  match: "*_i"
                                  mapping:
                                      type: integer
                          -   ez_mint:
                                  match: "*_mi"
                                  mapping:
                                      type: integer
                          -   ez_id:
                                  match: "*_id"
                                  mapping:
                                      type: keyword
                          -   ez_mid:
                                  match: "*_mid"
                                  mapping:
                                      type: keyword
                          -   ez_string:
                                  match: "*_s"
                                  mapping:
                                      type: keyword
                                      normalizer: lowercase_normalizer
                          -   ez_mstring:
                                  match: "*_ms"
                                  mapping:
                                      type: keyword
                                      normalizer: lowercase_normalizer
                          -   ez_long:
                                  match: "*_l"
                                  mapping:
                                      type: long
                          -   ez_mlong:
                                  match: "*_ml"
                                  mapping:
                                      type: long
                          # Text fields below use the language-aware analyzer
                          # defined in settings.analysis above.
                          -   ez_text_fulltext:
                                  match: "*_fulltext"
                                  mapping:
                                      type: text
                                      analyzer: ibexa_spellcheck_analyzer
                          -   ez_text:
                                  match: "*_t"
                                  mapping:
                                      type: text
                                      analyzer: ibexa_spellcheck_analyzer
                          -   ez_boolean:
                                  match: "*_b"
                                  mapping:
                                      type: boolean
                          -   ez_mboolean:
                                  match: "*_mb"
                                  mapping:
                                      type: boolean
                          -   ez_float:
                                  match: "*_f"
                                  mapping:
                                      type: float
                          -   ez_double:
                                  match: "*_d"
                                  mapping:
                                      type: double
                          -   ez_date:
                                  match: "*_dt"
                                  mapping:
                                      type: date
                          -   ez_geolocation:
                                  match: "*_gl"
                                  mapping:
                                      type: geo_point
                          # Indexed twice: with shingles (main field) and
                          # without (the "raw" sub-field).
                          -   ez_spellcheck:
                                  match: "*_spellcheck"
                                  mapping:
                                      type: text
                                      analyzer: ibexa_spellcheck_analyzer
                                      fields:
                                          raw:
                                              type: text
                                              analyzer: ibexa_spellcheck_raw_analyzer
      
              # Index template for English content: applied to every index
              # whose name matches "*_eng_gb*". Adds English possessive
              # stripping, stop-word filtering and stemming to the two
              # spellcheck analyzers.
              english:
                  patterns:
                      - "*_eng_gb*"
                  settings:
                      analysis:
                          normalizer:
                              # Lowercases keyword values; referenced by the
                              # "*_s" and "*_ms" dynamic templates below.
                              lowercase_normalizer:
                                  type: custom
                                  char_filter: []
                                  filter:
                                      - lowercase
                          analyzer:
                              # Analyzer for the "*_fulltext", "*_t" and
                              # "*_spellcheck" fields. The filter order is
                              # significant and follows Elasticsearch's built-in
                              # "english" analyzer: the possessive "'s" is
                              # stripped first so that the stop-word, keyword
                              # and stemmer filters see the bare word; shingles
                              # are built last from the stemmed tokens.
                              # NOTE(review): the "lowercase" tokenizer splits on
                              # non-letters, so the possessive filter likely has
                              # no effect here; kept for parity with the raw
                              # analyzer - confirm.
                              ibexa_spellcheck_analyzer:
                                  type: custom
                                  tokenizer: lowercase
                                  filter:
                                      - english_possessive_stemmer
                                      - lowercase
                                      - english_stop
                                      - english_keywords
                                      - english_stemmer
                                      - ibexa_spellcheck_shingle_filter
                              # Analyzer for the "raw" sub-field of
                              # "*_spellcheck": same language filters, but with
                              # the standard tokenizer and no shingles.
                              ibexa_spellcheck_raw_analyzer:
                                  type: custom
                                  tokenizer: standard
                                  filter:
                                      - english_possessive_stemmer
                                      - lowercase
                                      - english_stop
                                      - english_keywords
                                      - english_stemmer
                          filter:
                              # Emits 2- and 3-word shingles.
                              ibexa_spellcheck_shingle_filter:
                                  type: shingle
                                  min_shingle_size: 2
                                  max_shingle_size: 3
                              # "_english_" selects Elasticsearch's predefined
                              # English stop-word list.
                              english_stop:
                                  type: stop
                                  stopwords: "_english_"
                              # Words listed here are excluded from stemming;
                              # the list is empty in this example.
                              english_keywords:
                                  type: keyword_marker
                                  keywords: []
                              english_stemmer:
                                  type: stemmer
                                  # Available stemmer languages:
                                  # https://www.elastic.co/guide/en/elasticsearch/reference/7.17/analysis-stemmer-tokenfilter.html#analysis-stemmer-tokenfilter-configure-parms
                                  language: light_english
                              # Removes a trailing "'s" from tokens.
                              english_possessive_stemmer:
                                  type: stemmer
                                  language: possessive_english
                      # NOTE(review): "-1" disables periodic index refresh in
                      # Elasticsearch - presumably refreshes are triggered
                      # explicitly by the indexer; confirm.
                      refresh_interval: "-1"
                      index:
                          mapping:
                              total_fields:
                                  limit: 5000
                  mappings:
                      # Maps field-name suffixes to Elasticsearch field types.
                      # Keep the entry order unchanged: dynamic templates are
                      # evaluated in the order they are listed.
                      dynamic_templates:
                          -   ez_int:
                                  match: "*_i"
                                  mapping:
                                      type: integer
                          -   ez_mint:
                                  match: "*_mi"
                                  mapping:
                                      type: integer
                          -   ez_id:
                                  match: "*_id"
                                  mapping:
                                      type: keyword
                          -   ez_mid:
                                  match: "*_mid"
                                  mapping:
                                      type: keyword
                          -   ez_string:
                                  match: "*_s"
                                  mapping:
                                      type: keyword
                                      normalizer: lowercase_normalizer
                          -   ez_mstring:
                                  match: "*_ms"
                                  mapping:
                                      type: keyword
                                      normalizer: lowercase_normalizer
                          -   ez_long:
                                  match: "*_l"
                                  mapping:
                                      type: long
                          -   ez_mlong:
                                  match: "*_ml"
                                  mapping:
                                      type: long
                          # Text fields below use the language-aware analyzer
                          # defined in settings.analysis above.
                          -   ez_text_fulltext:
                                  match: "*_fulltext"
                                  mapping:
                                      type: text
                                      analyzer: ibexa_spellcheck_analyzer
                          -   ez_text:
                                  match: "*_t"
                                  mapping:
                                      type: text
                                      analyzer: ibexa_spellcheck_analyzer
                          -   ez_boolean:
                                  match: "*_b"
                                  mapping:
                                      type: boolean
                          -   ez_mboolean:
                                  match: "*_mb"
                                  mapping:
                                      type: boolean
                          -   ez_float:
                                  match: "*_f"
                                  mapping:
                                      type: float
                          -   ez_double:
                                  match: "*_d"
                                  mapping:
                                      type: double
                          -   ez_date:
                                  match: "*_dt"
                                  mapping:
                                      type: date
                          -   ez_geolocation:
                                  match: "*_gl"
                                  mapping:
                                      type: geo_point
                          # Indexed twice: with shingles (main field) and
                          # without (the "raw" sub-field).
                          -   ez_spellcheck:
                                  match: "*_spellcheck"
                                  mapping:
                                      type: text
                                      analyzer: ibexa_spellcheck_analyzer
                                      fields:
                                          raw:
                                              type: text
                                              analyzer: ibexa_spellcheck_raw_analyzer
      

       

      Designs

        Attachments

          Activity

            People

              vidar.langseid@ibexa.co Vidar Langseid
              vidar.langseid@ibexa.co Vidar Langseid
              Votes:
              1 Vote for this issue
              Watchers:
              3 Start watching this issue

              Dates

                Created:
                Updated: