相关度算法

算法介绍

relevance score(相关度评分)算法,简单来说,就是计算索引中的文本与搜索文本之间的关联匹配程度
Elasticsearch使用的是 term frequency/inverse document frequency算法,简称为TF/IDF算法(注:从Elasticsearch 5.0开始,默认的相似度算法改为BM25,它是在TF/IDF思想基础上的改进;下文explain结果中带有k1、b参数的idf/tfNorm公式即来自BM25)
Term frequency:搜索文本中的各个词条在field文本中出现了多少次,出现次数越多,就越相关
搜索请求:hello world
doc1:hello you, and world is very good
doc2:hello, how are you
doc1更相关:doc1同时包含hello和world,doc2只包含hello
Inverse document frequency:搜索文本中的各个词条在整个索引的所有文档中出现了多少次,出现的次数越多,就越不相关
搜索请求:hello world
doc1:hello, today is very good
doc2:hi world, how are you
比如说,在index中有1万条document,hello这个单词在所有的document中,一共出现了1000次;world这个单词在所有的document中,一共出现了100次
doc2更相关:world在整个索引中出现的次数更少(100次,对比hello的1000次),区分度更高,而doc2匹配到的正是world
Field-length norm:field长度,field越长,相关度越弱
搜索请求:hello world
doc1:{ "title": "hello article", "content": "babaaba 1万个单词" }
doc2:{ "title": "my article", "content": "blablabala 1万个单词,hi world" }
假设hello和world在整个index中出现的次数是一样多的
doc1更相关:doc1是在很短的title field中匹配到hello,而doc2是在很长的content field中匹配到world,field越短,匹配的相关度越高

_score是如何被计算出来的

GET /test_index/test_type/_search?explain
{ "query":{ "match":{ "test_field":"test hello" } } }

{ "took":6, "timed_out":false, "_shards":{ "total":5, "successful":5, "failed":0 }, "hits":{ "total":4, "max_score":1.595089, "hits":[ { "_shard":"[test_index][2]", "_node":"4onsTYVZTjGvIj9_spWz2w", "_index":"test_index", "_type":"test_type", "_id":"20", "_score":1.595089, "_source":{ "test_field":"test hello" }, "_explanation":{ "value":1.595089, "description":"sum of:", "details":[ { "value":1.595089, "description":"sum of:", "details":[ { "value":0.58279467, "description":"weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details":[ { "value":0.58279467, "description":"score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details":[ { "value":0.6931472, "description":"idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details":[ { "value":2, "description":"docFreq", "details":[ ] }, { "value":4, "description":"docCount", "details":[ ] } ] }, { "value":0.840795, "description":"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details":[ { "value":1, "description":"termFreq=1.0", "details":[ ] }, { "value":1.2, "description":"parameter k1", "details":[ ] }, { "value":0.75, "description":"parameter b", "details":[ ] }, { "value":1.75, "description":"avgFieldLength", "details":[ ] }, { "value":2.56, "description":"fieldLength", "details":[ ] } ] } ] } ] }, { "value":1.0122943, "description":"weight(test_field:hello in 0) [PerFieldSimilarity], result of:", "details":[ { "value":1.0122943, "description":"score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details":[ { "value":1.2039728, "description":"idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details":[ { "value":1, "description":"docFreq", "details":[ ] }, { "value":4, "description":"docCount", "details":[ ] } ] }, { "value":0.840795, "description":"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details":[ { "value":1,
"description":"termFreq=1.0", "details":[ ] }, { "value":1.2, "description":"parameter k1", "details":[ ] }, { "value":0.75, "description":"parameter b", "details":[ ] }, { "value":1.75, "description":"avgFieldLength", "details":[ ] }, { "value":2.56, "description":"fieldLength", "details":[ ] } ] } ] } ] } ] }, { "value":0, "description":"match on required clause, product of:", "details":[ { "value":0, "description":"# clause", "details":[ ] }, { "value":1, "description":"*:*, product of:", "details":[ { "value":1, "description":"boost", "details":[ ] }, { "value":1, "description":"queryNorm", "details":[ ] } ] } ] } ] } }, { "_shard":"[test_index][2]", "_node":"4onsTYVZTjGvIj9_spWz2w", "_index":"test_index", "_type":"test_type", "_id":"6", "_score":0.58279467, "_source":{ "test_field":"tes test" }, "_explanation":{ "value":0.58279467, "description":"sum of:", "details":[ { "value":0.58279467, "description":"sum of:", "details":[ { "value":0.58279467, "description":"weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details":[ { "value":0.58279467, "description":"score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details":[ { "value":0.6931472, "description":"idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details":[ { "value":2, "description":"docFreq", "details":[ ] }, { "value":4, "description":"docCount", "details":[ ] } ] }, { "value":0.840795, "description":"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details":[ { "value":1, "description":"termFreq=1.0", "details":[ ] }, { "value":1.2, "description":"parameter k1", "details":[ ] }, { "value":0.75, "description":"parameter b", "details":[ ] }, { "value":1.75, "description":"avgFieldLength", "details":[ ] }, { "value":2.56, "description":"fieldLength", "details":[ ] } ] } ] } ] } ] }, { "value":0, "description":"match on required clause, product of:", "details":[ { "value":0,
"description":"# clause", "details":[ ] }, { "value":1, "description":"*:*, product of:", "details":[ { "value":1, "description":"boost", "details":[ ] }, { "value":1, "description":"queryNorm", "details":[ ] } ] } ] } ] } }, { "_shard":"[test_index][3]", "_node":"4onsTYVZTjGvIj9_spWz2w", "_index":"test_index", "_type":"test_type", "_id":"7", "_score":0.5565415, "_source":{ "test_field":"test client 2" }, "_explanation":{ "value":0.5565415, "description":"sum of:", "details":[ { "value":0.5565415, "description":"sum of:", "details":[ { "value":0.5565415, "description":"weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details":[ { "value":0.5565415, "description":"score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details":[ { "value":0.6931472, "description":"idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details":[ { "value":1, "description":"docFreq", "details":[ ] }, { "value":2, "description":"docCount", "details":[ ] } ] }, { "value":0.8029196, "description":"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details":[ { "value":1, "description":"termFreq=1.0", "details":[ ] }, { "value":1.2, "description":"parameter k1", "details":[ ] }, { "value":0.75, "description":"parameter b", "details":[ ] }, { "value":2.5, "description":"avgFieldLength", "details":[ ] }, { "value":4, "description":"fieldLength", "details":[ ] } ] } ] } ] } ] }, { "value":0, "description":"match on required clause, product of:", "details":[ { "value":0, "description":"# clause", "details":[ ] }, { "value":1, "description":"_type:test_type, product of:", "details":[ { "value":1, "description":"boost", "details":[ ] }, { "value":1, "description":"queryNorm", "details":[ ] } ] } ] } ] } }, { "_shard":"[test_index][1]", "_node":"4onsTYVZTjGvIj9_spWz2w", "_index":"test_index", "_type":"test_type", "_id":"8", "_score":0.25316024, "_source":{ "test_field":"test client 2" }, "_explanation":{
"value":0.25316024, "description":"sum of:", "details":[ { "value":0.25316024, "description":"sum of:", "details":[ { "value":0.25316024, "description":"weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details":[ { "value":0.25316024, "description":"score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details":[ { "value":0.2876821, "description":"idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details":[ { "value":1, "description":"docFreq", "details":[ ] }, { "value":1, "description":"docCount", "details":[ ] } ] }, { "value":0.88, "description":"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details":[ { "value":1, "description":"termFreq=1.0", "details":[ ] }, { "value":1.2, "description":"parameter k1", "details":[ ] }, { "value":0.75, "description":"parameter b", "details":[ ] }, { "value":3, "description":"avgFieldLength", "details":[ ] }, { "value":4, "description":"fieldLength", "details":[ ] } ] } ] } ] } ] }, { "value":0, "description":"match on required clause, product of:", "details":[ { "value":0, "description":"# clause", "details":[ ] }, { "value":1, "description":"*:*, product of:", "details":[ { "value":1, "description":"boost", "details":[ ] }, { "value":1, "description":"queryNorm", "details":[ ] } ] } ] } ] } } ] } }
 
分析一个document是如何被匹配上的
GET /test_index/test_type/6/_explain
{ "query": { "match": { "test_field": "test hello" } } }