Analysis (word segmentation) plugins
1. analysis-smartcn v2.1.1
2. analysis-mmseg v1.7.0
3. analysis-ik v7.6.2 (preferred; install sketch below)
4. analysis-stconvert v1.6.1
5. analysis-pinyin v1.5.2
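Each of these is installed with Elasticsearch's plugin tool and requires a node restart. A minimal sketch for the preferred analysis-ik, assuming the usual GitHub release artifact (the plugin version must match the Elasticsearch version exactly; check the plugin's releases page for the actual URL):

# install analysis-ik from a release zip (URL is illustrative)
bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v7.6.2/elasticsearch-analysis-ik-7.6.2.zip

Older 1.x/2.x nodes ship bin/plugin instead of bin/elasticsearch-plugin.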
elasticsearch.yml configuration
# Cluster name
#cluster.name: "cn-out-of-box"
# Node name
#node.name: "node1"
# Whether this node is eligible to be elected master
#node.master: true
# Whether this node stores index data
#node.data: true
# Default number of shards per index
#index.number_of_shards: 3
# Default number of replicas per index
#index.number_of_replicas: 1
# Temporary file path
#path.work: "/tmp/elasticsearch"
# Log file path
#path.logs: "/var/log/elasticsearch/logs"
# TCP transport port
#transport.tcp.port: 9300
# Whether to compress TCP transport traffic
transport.tcp.compress: true
# HTTP port
#http.port: 9200
# Whether to enable the HTTP service
#http.enabled: true
# Whether to enable multicast node discovery
#discovery.zen.ping.multicast.enabled: true
#http.max_content_length: 100mb
# Local filesystem gateway
#gateway.type: local
# Settings below control how and when to start the initial recovery process on
# a full cluster restart (to reuse as much local data as possible when using shared
# gateway).
# Start recovery once N nodes are up
#gateway.recover_after_nodes: 1
# Timeout before the initial recovery process starts
#gateway.recover_after_time: 5m
# Expected cluster size; recovery starts as soon as all nodes are up
#gateway.expected_nodes: 2
# Concurrent threads for initial primary-shard recovery
#cluster.routing.allocation.node_initial_primaries_recoveries: 4
# Concurrent recovery threads when nodes are added/removed or during rebalancing
#cluster.routing.allocation.node_concurrent_recoveries: 2
# Bandwidth cap during recovery; 0 means unlimited
#indices.recovery.max_bytes_per_sec: 20mb
# Maximum concurrent streams opened when recovering data from other shards
#indices.recovery.concurrent_streams: 5
# Slow-query log thresholds
#index.search.slowlog.threshold.query.warn: 10s
#index.search.slowlog.threshold.query.info: 5s
#index.search.slowlog.threshold.query.debug: 2s
#index.search.slowlog.threshold.query.trace: 500ms
#index.search.slowlog.threshold.fetch.warn: 1s
#index.search.slowlog.threshold.fetch.info: 800ms
#index.search.slowlog.threshold.fetch.debug: 500ms
#index.search.slowlog.threshold.fetch.trace: 200ms

index.version.created: xxx

# Index settings
index:
  # Analysis settings
  analysis:
    # Tokenizer definitions
    tokenizer:
      # ======== analysis-pinyin ========
      # Full pinyin
      my_pinyin:
        type: pinyin
        first_letter: prefix
        padding_char: ' '
      # Pinyin first letters only
      pinyin_first_letter:
        type: pinyin
        first_letter: only

      # ======== analysis-mmseg ========
      # Simple forward maximum matching
      # example: 一个人使劲儿的说话
      #   一个人
      #   一个劲
      #   一个劲儿
      #   一个劲儿的
      #mmseg_simple:
      #  type: mmseg
      #  seg_type: simple

      # Finds every possible "chunk of three words" and disambiguates with four
      # rules (maximum matching, largest average word length, smallest variance
      # of word lengths, largest sum of logarithms of single-character word
      # frequencies)
      # example: 研究生命起源
      #   研_究_生
      #   研_究_生命
      #   研究生_命_起源
      #   研究_生命_起源
      #mmseg_complex:
      #  type: mmseg
      #  seg_type: complex

      # Max-word segmentation on top of complex
      # example: 中国人民银行
      #   中国|人民|银行
      #mmseg_maxword:
      #  type: mmseg
      #  seg_type: max_word

      # ======== analysis-stconvert ========
      # Simplified-to-traditional conversion, traditional output only
      #s2t_convert:
      #  type: stconvert
      #  delimiter: ","
      #  convert_type: s2t

      # Traditional-to-simplified conversion, simplified output only
      #t2s_convert:
      #  type: stconvert
      #  delimiter: ","
      #  convert_type: t2s

      # Simplified-to-traditional conversion, keeping both forms in the output
      #s2t_keep_both_convert:
      #  type: stconvert
      #  delimiter: ","
      #  keep_both: 'true'
      #  convert_type: s2t

      # Traditional-to-simplified conversion, keeping both forms in the output
      #t2s_keep_both_convert:
      #  type: stconvert
      #  delimiter: ","
      #  keep_both: 'true'
      #  convert_type: t2s

      # ======== analysis-pattern ========
      # Regex, split on semicolons
      semicolon_spliter:
        type: pattern
        pattern: ";"
      # Regex, split on %
      pct_spliter:
        type: pattern
        pattern: "[%]+"

      # ======== analysis-nGram ========
      # Terms of 1-2 characters
      ngram_1_to_2:
        type: nGram
        min_gram: 1
        max_gram: 2
      # Terms of 1-3 characters
      ngram_1_to_3:
        type: nGram
        min_gram: 1
        max_gram: 3

    # Token filter definitions
    filter:
      # ======== ngram filters ========
      ngram_min_3:
        max_gram: 10
        min_gram: 3
        type: nGram
      ngram_min_2:
        max_gram: 10
        min_gram: 2
        type: nGram
      ngram_min_1:
        max_gram: 10
        min_gram: 1
        type: nGram

      # ======== length filters ========
      min2_length:
        min: 2
        max: 4
        type: length
      min3_length:
        min: 3
        max: 4
        type: length

      # ======== pinyin filter ========
      pinyin_first_letter:
        type: pinyin
        first_letter: only

    # Analyzer definitions
    analyzer:
      lowercase_keyword:
        type: custom
        filter:
          - lowercase
        tokenizer: standard
      lowercase_keyword_ngram_min_size1:
        type: custom
        filter:
          - lowercase
          - stop
          - trim
          - unique
        tokenizer: nGram
      lowercase_keyword_ngram_min_size2:
        type: custom
        filter:
          - lowercase
          - min2_length
          - stop
          - trim
          - unique
        tokenizer: nGram
      lowercase_keyword_ngram_min_size3:
        type: custom
        filter:
          - lowercase
          - min3_length
          - stop
          - trim
          - unique
        tokenizer: ngram_1_to_3
      lowercase_keyword_ngram:
        type: custom
        filter:
          - lowercase
          - stop
          - trim
          - unique
        tokenizer: ngram_1_to_3
      lowercase_keyword_without_standard:
        type: custom
        filter:
          - lowercase
        tokenizer: keyword
      lowercase_whitespace:
        type: custom
        filter:
          - lowercase
        tokenizer: whitespace

      # ======== ik ========
      # ik analyzer
      #ik:
      #  alias:
      #    - ik_analyzer
      #  type: ik_smart

      # Finest-grained ik segmentation: 「三只老鼠在挖洞」 is split into
      # 「三只老鼠在挖洞、三只老鼠、三只、老鼠、挖洞」
      #ik_max_word:
      #  type: ik
      #  use_smart: false

      # Smart ik segmentation: 「三只老鼠在挖洞」 is split into
      # 「三只老鼠、老鼠在挖洞」
      #ik_smart:
      #  type: ik
      #  use_smart: true

      # ======== mmseg ========
      # mmseg analyzers
      mmseg_maxword:
        type: custom
        filter:
          - lowercase
        tokenizer: mmseg_maxword
      mmseg_complex:
        type: custom
        filter:
          - lowercase
        tokenizer: mmseg_complex
      mmseg_simple:
        type: custom
        filter:
          - lowercase
        tokenizer: mmseg_simple

      # ======== regex ========
      comma_spliter:
        type: pattern
        pattern: "[,|\\s]+"
      pct_spliter:
        type: pattern
        pattern: "[%]+"

      custom_snowball_analyzer:
        type: snowball
        language: English
      simple_english_analyzer:
        type: custom
        tokenizer: whitespace
        filter:
          - standard
          - lowercase
          - snowball
      edge_ngram:
        type: custom
        tokenizer: edgeNGram
        filter:
          - lowercase

      # ======== pinyin analysis ========
      pinyin_ngram_analyzer:
        type: custom
        tokenizer: my_pinyin
        filter:
          - lowercase
          - nGram
          - trim
          - unique

      # ======== pinyin first-letter analysis ========
      pinyin_first_letter_analyzer:
        type: custom
        tokenizer: pinyin_first_letter
        filter:
          - standard
          - lowercase

      # ======== pinyin first-letter analysis with filtering ========
      pinyin_first_letter_keyword_analyzer:
        alias:
          - pinyin_first_letter_analyzer_keyword
        type: custom
        tokenizer: keyword
        filter:
          - pinyin_first_letter
          - lowercase

      # ======== simplified/traditional Chinese conversion ========
      #stconvert:
      #  alias:
      #    - st_analyzer
      #  type: stconvert
      #s2t_convert:
      #  type: stconvert
      #  delimiter: ","
      #  convert_type: s2t
      #t2s_convert:
      #  type: stconvert
      #  delimiter: ","
      #  convert_type: t2s
      #s2t_keep_both_convert:
      #  type: stconvert
      #  delimiter: ","
      #  keep_both: 'true'
      #  convert_type: s2t
      #t2s_keep_both_convert:
      #  type: stconvert
      #  delimiter: ","
      #  keep_both: 'true'
      #  convert_type: t2s

      # Path analysis
      path_analyzer:
        type: custom
        tokenizer: path_hierarchy
      uax_url_email:
        tokenizer: uax_url_email
        filter: [standard, lowercase, stop]

# Thread pool settings
#threadpool:
#  index:
#    type: fixed
#    size: 30
#    queue: -1
#    reject_policy: caller
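After a restart, any analyzer defined above can be sanity-checked through the standard _analyze API. A minimal sketch, using the request-body form of recent versions (older 1.x nodes pass analyzer and text as query parameters instead; the sample text is arbitrary):

# ask the node to run one analyzer over a sample string and return its tokens
curl -XGET 'localhost:9200/_analyze?pretty' -H 'Content-Type: application/json' -d '
{
  "analyzer": "pinyin_first_letter_analyzer",
  "text": "中国人民银行"
}'

The response lists the emitted tokens, which makes it easy to compare, say, mmseg_maxword against mmseg_complex on the same text. Analyzers declared here can also be referenced by name in index mappings.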