vearch / vearch

Distributed vector search for AI-native applications

Home Page:https://vearch.github.io

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

gamma报错导致ps宕机

gtf520 opened this issue · comments

commented

vearch docker 3.2.7 版本
配置如下:
[global]
# cluster name; a node may join the cluster only if it uses the same name
name = "zycfc-vearch02"
# paths where your data is saved on disk; in a production environment, you'd better set absolute paths
data = ["/data/vearch/data"]
# log path; in a production environment, you'd better set absolute paths
log = "/data/vearch/logs"
# default log level for any module
level = "debug"
# master <-> ps <-> router will use this key to send or receive data
signkey = "vearch-ai"
skip_auth = true
# tell Vearch whether it should manage its own instance of etcd or not
# self_manage_etcd = false
# automatically remove the failed node and recover when new nodes join
# auto_recover_ps = false
# support etcd basic auth access; depends on self_manage_etcd = true
# support_etcd_auth = false
# ensure leader-follower raft data synchronization is consistent
raft_consistent = true

# if you are master, you'd better set all config for router and ps; router and ps can then use the default config

[[masters]]
# machine name of this master within the cluster
name = "m1"
# ip or domain
address = "50.16.3.26"
# api port for http server
api_port = 8817
# port for etcd server
etcd_port = 2378
# listen_peer_urls: list of comma-separated URLs to listen on for peer traffic.
# advertise_peer_urls: list of this member's peer URLs to advertise to the rest of the cluster. The URLs need to be a comma-separated list.
etcd_peer_port = 2390
# List of this member's client URLs to advertise to the public.
# The URLs need to be a comma-separated list.
# advertise_client_urls AND listen_client_urls
etcd_client_port = 2370
pprof_port = 6062
# monitor
monitor_port = 8818

[router]
# port for the router server
port = 9001

[ps]
# port for server
rpc_port = 8081
# raft config begin
ps_heartbeat_timeout = 5 # seconds
raft_heartbeat_port = 8898
raft_replicate_port = 8899
# NOTE(review): this key is kebab-case while its neighbours are snake_case — confirm it matches the name the server actually reads
heartbeat-interval = 2000 # ms
raft_retain_logs = 200000000
raft_replica_concurrency = 1
raft_snap_concurrency = 1
raft_truncate_count = 100000000
# when this node falls behind the leader by this value, it will stop serving searches
raft_diff_count = 100000
# engine config
engine_dwpt_num = 8
# max size in bytes
# max_size = 50000000
pprof_port = 6061
# if set to true, this ps is only used in db meta config
private = false
# seconds
flush_time_interval = 600
flush_count_threshold = 10000

报错日志:

INFO 2023-07-21 03:55:14,593 gamma_index_ivfpq.cc:611 update index success! size=56, total=1479
INFO 2023-07-21 03:55:14,722 gamma_index_ivfpq.cc:611 update index success! size=52, total=2048
INFO 2023-07-21 03:55:14,754 gamma_index_ivfpq.cc:611 update index success! size=53, total=1474
INFO 2023-07-21 03:55:14,760 gamma_index_ivfpq.cc:611 update index success! size=53, total=1525
INFO 2023-07-21 03:55:14,766 gamma_index_ivfpq.cc:611 update index success! size=52, total=1505
INFO 2023-07-21 03:55:15,082 gamma_index_ivfpq.cc:611 update index success! size=54, total=1787
INFO 2023-07-21 03:55:15,194 gamma_index_ivfpq.cc:611 update index success! size=53, total=2019
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261518 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261519 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261520 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261521 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261522 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261523 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261524 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261525 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261526 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261527 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261528 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261529 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261530 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261531 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261532 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261533 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261534 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261535 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261536 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261537 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261538 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261539 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261540 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261541 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261542 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261543 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261544 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261545 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261546 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261547 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261548 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261549 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,214 bitmap_manager.cc:100 parameters error, begin_bit_id=1261550 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261551 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261552 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261553 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261554 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261555 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261556 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261557 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261558 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261559 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261560 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261561 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261562 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261563 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261564 dump_bit_len=1 bit_size=1250000
ERROR 2023-07-21 03:55:15,215 bitmap_manager.cc:100 parameters error, begin_bit_id=1261565 dump_bit_len=1 bit_size=1250000

commented

这个代码逻辑看着有点奇怪,是直接拉的镜像吗?docker pull vearch/vearch:3.2.7

是直接拉的docker镜像,集群一共3台ps进程,每个ps进程目前都出现了这个问题。这个异常信息一直在快速刷。

commented

麻烦贴一下具体操作呢?看日志是bitmap dump的时候size不对,看能不能复现下

现在服务一直在大量刷这个错。服务正常运行的时候出现这个问题,ps容器出现了宕机,重启ps容器后,raft目录有45G,启动很慢,删除了raft目录,再次进行了重启。这个报错还有。

业务使用时,有根据索引删除数据的操作,跟这个有关系吗?

commented

可以用最新的镜像v3.3.0,delete_by_query接口之前的版本确实有问题

确定是v3.3.0的镜像吗?docker pull vearch/vearch:3.3.0 ,我尝试拉没拉到,我看项目最新tag是 3.2.10

目前集群重启过一会就会出现异常,一直在刷上面的异常日志,我把调delete_by_query接口的表直接删除能恢复集群吗?

commented

目前集群重启过一会就会出现异常,一直在刷上面的异常日志,我把调delete_by_query接口的表直接删除能恢复集群吗?

可以先这么恢复,后续可以考虑升级镜像v3.3.0

3.3版本有什么修改,有文档吗?我们了解一下。

commented

可以看一下3.3.0的release note