1. 下载插件
从 GitHub 下载 Elasticsearch Vector Scoring 插件。
2. 安装插件
将插件文件放置在 elasticsearch/plugins 目录下。安装后的目录结构如下:
plugins
└── vector
    ├── elasticsearch-binary-vector-scoring-5.6.9.jar
    └── plugin-descriptor.properties
修改 plugin-descriptor.properties 文件中的 elasticsearch.version,使其与实际使用的 Elasticsearch 版本一致(本文使用的是 5.6.15)。安装完成后,重启 Elasticsearch 服务。
3. 构建测试索引
使用以下命令创建一个名为 vector_test 的索引:
PUT /vector_test
{
  "settings": {
    "index": {
      "number_of_shards": 3,
      "number_of_replicas": 0
    }
  },
  "mappings": {
    "resume": {
      "dynamic": "strict",
      "properties": {
        "file_hash": {
          "type": "keyword"
        },
        "embedding_vector": {
          "type": "binary",
          "doc_values": true
        },
        "doc": {
          "type": "text"
        }
      }
    }
  }
}
4. 构建测试数据
使用以下 Python 代码生成向量的 base64 字符串:
import base64
import numpy as np
# NumPy dtype string '>f4' = big-endian, 4-byte IEEE-754 float. Both helpers
# below use it so that encoding and decoding always agree on the byte layout.
dfloat32 = np.dtype('>f4')


def decode_float_list(base64_string):
    """Decode a base64 string back into a list of Python floats.

    Args:
        base64_string: Base64 text whose decoded payload is a sequence of
            big-endian 32-bit floats (the format produced by
            ``encode_array``).

    Returns:
        A plain ``list`` of floats, one per 4-byte group in the payload.
    """
    # Named ``raw`` rather than ``bytes`` to avoid shadowing the builtin.
    raw = base64.b64decode(base64_string)
    return np.frombuffer(raw, dtype=dfloat32).tolist()


def encode_array(arr):
    """Encode a sequence of numbers as base64 text of big-endian float32s.

    Args:
        arr: Any array-like of numbers accepted by ``np.array``.

    Returns:
        The base64 representation (as ``str``) of the values after they
        have been cast to big-endian 32-bit floats.
    """
    # Materialise the packed bytes explicitly instead of relying on
    # b64encode accepting an ndarray through the buffer protocol.
    packed = np.array(arr).astype(dfloat32).tobytes()
    return base64.b64encode(packed).decode("utf-8")


print(encode_array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]))
print(encode_array([0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.010]))
将生成的 base64 字符串作为 embedding_vector 字段的值,插入到 Elasticsearch 中的 vector_test 索引:
PUT /vector_test/resume/1
{
"file_hash": "hash1",
"embedding_vector": "PczMzT5MzM0+mZmaPszMzT8AAAA/GZmaPzMzMz9MzM0/ZmZmP4AAAA==",
"doc": "This is the content of the first document."
}
PUT /vector_test/resume/2
{
"file_hash": "hash2",
"embedding_vector": "OoMSbzsDEm87RJumO4MSbzuj1wo7xJumO+VgQjwDEm88E3S8PCPXCg==",
"doc": "This is the content of the second document."
}
5. 查询测试
使用以下查询语句搜索匹配的向量:
POST /vector_test/resume/_search
{
  "query": {
    "function_score": {
      "boost_mode": "replace",
      "script_score": {
        "script": {
          "source": "binary_vector_score",
          "lang": "knn",
          "params": {
            "cosine": true,
            "field": "embedding_vector",
            "vector": [
              1.0, 0.8, 0.2223, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1
            ]
          }
        }
      }
    }
  },
  "size": 2,
  "_source": [
    "file_hash"
  ]
}
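查询结果如下(注意:此示例输出取自一个包含 4 条文档、5 个分片的索引,因此其中的 _shards.total、hits.total 以及 _id 为 4 的文档与上文创建的 3 分片、仅插入 2 条文档的索引并不一致;按本文步骤操作时,实际返回内容会有所不同):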
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 4,
"max_score": 0.998783,
"hits": [
{
"_index": "vector_test",
"_type": "resume",
"_id": "4",
"_score": 0.998783,
"_source": {
"file_hash": "hash4"
}
},
{
"_index": "vector_test",
"_type": "resume",
"_id": "1",
"_score": 0.5818508,
"_source": {
"file_hash": "hash1"
}
}
]
}
}