Skip to content

Commit b3de99e

Browse files
authored
[test]Add method to analyze chaos test result (milvus-io#26724)
Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
1 parent 09218bf commit b3de99e

12 files changed

Lines changed: 340 additions & 70 deletions

File tree

tests/benchmark/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
# --extra-index-url https://test.pypi.org/simple/
33
# pymilvus==2.0.0rc3.dev8
44

5-
grpcio==1.37.1
5+
grpcio==1.53.0
66
grpcio-testing==1.37.1
77
grpcio-tools==1.37.1
88

99
pandas==1.1.5
10-
scipy==1.3.1
10+
scipy==1.10.0
1111
scikit-learn==0.19.1
1212
h5py==2.7.1
1313
# influxdb==5.2.2

tests/python_client/chaos/checker.py

Lines changed: 219 additions & 25 deletions
Large diffs are not rendered by default.

tests/python_client/chaos/conftest.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@ def pytest_addoption(parser):
99
parser.addoption("--target_number", action="store", default="1", help="target_number")
1010
parser.addoption("--chaos_duration", action="store", default="1m", help="chaos_duration")
1111
parser.addoption("--chaos_interval", action="store", default="10s", help="chaos_interval")
12-
parser.addoption("--request_duration", action="store", default="3m", help="request_duration")
13-
parser.addoption("--is_check", action="store", type=bool, default=False, help="is_check")
12+
parser.addoption("--request_duration", action="store", default="5m", help="request_duration")
13+
parser.addoption("--is_check", action="store", type=bool, default=False, help="is_check")
14+
parser.addoption("--wait_signal", action="store", type=bool, default=True, help="wait_signal")
1415

1516

1617
@pytest.fixture
@@ -56,3 +57,8 @@ def request_duration(request):
5657
@pytest.fixture
5758
def is_check(request):
5859
return request.config.getoption("--is_check")
60+
61+
62+
@pytest.fixture
63+
def wait_signal(request):
64+
return request.config.getoption("--wait_signal")

tests/python_client/chaos/constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,5 @@
2222
WAIT_PER_OP = 10 # time to wait in seconds between operations
2323
CHAOS_DURATION = 120 # chaos duration time in seconds
2424
DEFAULT_INDEX_PARAM = {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 48, "efConstruction": 500}}
25-
DEFAULT_SEARCH_PARAM = {"metric_type": "L2", "params": {"ef": 64}}
25+
DEFAULT_SEARCH_PARAM = {"metric_type": "L2", "params": {"ef": 64}}
26+
CHAOS_INFO_SAVE_PATH = "/tmp/ci_logs/chaos_info.json"
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
# for test result analyzer
3+
prettytable==3.8.0
4+
pyarrow==11.0.0
5+
fastparquet==2023.7.0

tests/python_client/chaos/test_chaos_apply.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@
33
import time
44
from time import sleep
55
from pathlib import Path
6+
import json
67
from pymilvus import connections
78
from common.cus_resource_opts import CustomResourceOperations as CusResource
89
from common.milvus_sys import MilvusSys
9-
import logging as log
10+
from utils.util_log import test_log as log
11+
from datetime import datetime
1012
from utils.util_k8s import wait_pods_ready, get_milvus_instance_name, get_milvus_deploy_tool
11-
from utils.util_common import update_key_value, update_key_name, gen_experiment_config
13+
from utils.util_common import update_key_value, update_key_name, gen_experiment_config, wait_signal_to_apply_chaos
1214
import constants
1315

1416

@@ -54,9 +56,17 @@ def teardown(self):
5456
chaos_res.delete(meta_name, raise_ex=False)
5557
sleep(2)
5658

57-
def test_chaos_apply(self, chaos_type, target_component, target_number, chaos_duration, chaos_interval):
59+
def test_chaos_apply(self, chaos_type, target_component, target_number, chaos_duration, chaos_interval, wait_signal):
5860
# start the monitor threads to check the milvus ops
5961
log.info("*********************Chaos Test Start**********************")
62+
if wait_signal:
63+
log.info("need wait signal to start chaos")
64+
ready_for_chaos = wait_signal_to_apply_chaos()
65+
if not ready_for_chaos:
66+
log.info("did not get the signal to apply chaos")
67+
raise Exception
68+
else:
69+
log.info("get the signal to apply chaos")
6070
log.info(connections.get_connection_addr('default'))
6171
release_name = self.release_name
6272
chaos_config = gen_experiment_config(
@@ -88,6 +98,7 @@ def test_chaos_apply(self, chaos_type, target_component, target_number, chaos_du
8898
version=constants.CHAOS_VERSION,
8999
namespace=constants.CHAOS_NAMESPACE)
90100
chaos_res.create(chaos_config)
101+
create_time = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f')
91102
log.info("chaos injected")
92103
res = chaos_res.list_all()
93104
chaos_list = [r['metadata']['name'] for r in res['items']]
@@ -97,6 +108,7 @@ def test_chaos_apply(self, chaos_type, target_component, target_number, chaos_du
97108
sleep(chaos_duration)
98109
# delete chaos
99110
chaos_res.delete(meta_name)
111+
delete_time = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f')
100112
log.info("chaos deleted")
101113
res = chaos_res.list_all()
102114
chaos_list = [r['metadata']['name'] for r in res['items']]
@@ -114,6 +126,18 @@ def test_chaos_apply(self, chaos_type, target_component, target_number, chaos_du
114126
log.info("all pods are ready")
115127
pods_ready_time = time.time() - t0
116128
log.info(f"pods ready time: {pods_ready_time}")
129+
recovery_time = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f')
130+
event_records = {
131+
"chaos_type": chaos_type,
132+
"target_component": target_component,
133+
"meta_name": meta_name,
134+
"create_time": create_time,
135+
"delete_time": delete_time,
136+
"recovery_time": recovery_time
137+
}
138+
# save event records to json file
139+
with open(constants.CHAOS_INFO_SAVE_PATH, 'w') as f:
140+
json.dump(event_records, f)
117141
# reconnect to test the service healthy
118142
start_time = time.time()
119143
end_time = start_time + 120
@@ -125,5 +149,6 @@ def test_chaos_apply(self, chaos_type, target_component, target_number, chaos_du
125149
log.error(e)
126150
sleep(2)
127151
recovery_time = time.time() - start_time
128-
log.info(f"recovery time: {recovery_time}")
152+
log.info(f"recovery time from pod ready to can be connected: {recovery_time}")
153+
129154
log.info("*********************Chaos Test Completed**********************")

tests/python_client/chaos/testcases/test_concurrent_operation.py

Lines changed: 21 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import threading
1+
import time
22
import pytest
33
import json
44
from time import sleep
@@ -9,40 +9,20 @@
99
SearchChecker,
1010
QueryChecker,
1111
DeleteChecker,
12-
Op)
13-
from common.cus_resource_opts import CustomResourceOperations as CusResource
12+
Op,
13+
ResultAnalyzer
14+
)
15+
from utils.util_k8s import wait_pods_ready, get_milvus_instance_name
1416
from utils.util_log import test_log as log
1517
from chaos import chaos_commons as cc
1618
from common import common_func as cf
19+
from common.milvus_sys import MilvusSys
1720
from chaos.chaos_commons import assert_statistic
1821
from common.common_type import CaseLabel
1922
from chaos import constants
2023
from delayed_assert import expect, assert_expectations
2124

2225

23-
def assert_statistic(checkers, expectations={}):
24-
for k in checkers.keys():
25-
# expect succ if no expectations
26-
succ_rate = checkers[k].succ_rate()
27-
total = checkers[k].total()
28-
average_time = checkers[k].average_time
29-
if 'compact' in str(k):
30-
log.info("skip compact check")
31-
log.info(
32-
f"Expect Succ: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
33-
continue
34-
if expectations.get(k, '') == constants.FAIL:
35-
log.info(
36-
f"Expect Fail: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
37-
expect(succ_rate < 0.49 or total < 2,
38-
f"Expect Fail: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
39-
else:
40-
log.info(
41-
f"Expect Succ: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
42-
expect(succ_rate > 0.90 and total > 2,
43-
f"Expect Succ: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
44-
45-
4626
def get_all_collections():
4727
try:
4828
with open("/tmp/ci_logs/all_collections.json", "r") as f:
@@ -70,7 +50,7 @@ class TestBase:
7050
class TestOperations(TestBase):
7151

7252
@pytest.fixture(scope="function", autouse=True)
73-
def connection(self, host, port, user, password):
53+
def connection(self, host, port, user, password, milvus_ns):
7454
if user and password:
7555
# log.info(f"connect to {host}:{port} with user {user} and password {password}")
7656
connections.connect('default', host=host, port=port, user=user, password=password, secure=True)
@@ -82,7 +62,10 @@ def connection(self, host, port, user, password):
8262
self.host = host
8363
self.port = port
8464
self.user = user
85-
self.password = password
65+
self.password = password
66+
self.milvus_sys = MilvusSys(alias='default')
67+
self.milvus_ns = milvus_ns
68+
self.release_name = get_milvus_instance_name(self.milvus_ns, milvus_sys=self.milvus_sys)
8669

8770
def init_health_checkers(self, collection_name=None):
8871
c_name = collection_name
@@ -91,7 +74,7 @@ def init_health_checkers(self, collection_name=None):
9174
Op.flush: FlushChecker(collection_name=c_name),
9275
Op.search: SearchChecker(collection_name=c_name),
9376
Op.query: QueryChecker(collection_name=c_name),
94-
Op.compact:CompactChecker(collection_name=c_name),
77+
Op.compact: CompactChecker(collection_name=c_name),
9578
Op.delete: DeleteChecker(collection_name=c_name),
9679
}
9780
self.health_checkers = checkers
@@ -107,11 +90,14 @@ def test_operations(self, request_duration, is_check, collection_name):
10790
# start the monitor threads to check the milvus ops
10891
log.info("*********************Test Start**********************")
10992
log.info(connections.get_connection_addr('default'))
93+
# event_records = EventRecords()
11094
c_name = collection_name if collection_name else cf.gen_unique_str("Checker_")
95+
# event_records.insert("init_health_checkers", "start")
11196
self.init_health_checkers(collection_name=c_name)
97+
# event_records.insert("init_health_checkers", "finished")
11298
cc.start_monitor_threads(self.health_checkers)
11399
log.info("*********************Load Start**********************")
114-
request_duration = request_duration.replace("h","*3600+").replace("m","*60+").replace("s","")
100+
request_duration = request_duration.replace("h", "*3600+").replace("m", "*60+").replace("s", "")
115101
if request_duration[-1] == "+":
116102
request_duration = request_duration[:-1]
117103
request_duration = eval(request_duration)
@@ -120,7 +106,11 @@ def test_operations(self, request_duration, is_check, collection_name):
120106
for k, v in self.health_checkers.items():
121107
v.check_result()
122108
# log.info(v.check_result())
109+
wait_pods_ready(self.milvus_ns, f"app.kubernetes.io/instance={self.release_name}")
110+
time.sleep(60)
111+
ra = ResultAnalyzer()
112+
ra.get_stage_success_rate()
123113
if is_check:
124114
assert_statistic(self.health_checkers)
125-
assert_expectations()
115+
assert_expectations()
126116
log.info("*********************Chaos Test Completed**********************")

tests/python_client/chaos/testcases/test_single_request_operation.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,15 @@
1111
IndexChecker,
1212
DeleteChecker,
1313
DropChecker,
14-
Op)
14+
Op,
15+
EventRecords,
16+
ResultAnalyzer
17+
)
1518
from utils.util_log import test_log as log
1619
from utils.util_k8s import wait_pods_ready, get_milvus_instance_name
1720
from chaos import chaos_commons as cc
1821
from common.common_type import CaseLabel
22+
from common.milvus_sys import MilvusSys
1923
from chaos.chaos_commons import assert_statistic
2024
from chaos import constants
2125
from delayed_assert import assert_expectations
@@ -50,6 +54,7 @@ def connection(self, host, port, user, password, milvus_ns):
5054
self.port = port
5155
self.user = user
5256
self.password = password
57+
self.milvus_sys = MilvusSys(alias='default')
5358
self.milvus_ns = milvus_ns
5459
self.release_name = get_milvus_instance_name(self.milvus_ns, milvus_sys=self.milvus_sys)
5560

@@ -72,8 +77,11 @@ def test_operations(self, request_duration, is_check):
7277
# start the monitor threads to check the milvus ops
7378
log.info("*********************Test Start**********************")
7479
log.info(connections.get_connection_addr('default'))
80+
event_records = EventRecords()
7581
c_name = None
82+
event_records.insert("init_health_checkers", "start")
7683
self.init_health_checkers(collection_name=c_name)
84+
event_records.insert("init_health_checkers", "finished")
7785
cc.start_monitor_threads(self.health_checkers)
7886
log.info("*********************Load Start**********************")
7987
# wait request_duration
@@ -83,6 +91,9 @@ def test_operations(self, request_duration, is_check):
8391
request_duration = eval(request_duration)
8492
for i in range(10):
8593
sleep(request_duration // 10)
94+
# add an event so that the chaos can start to apply
95+
if i == 3:
96+
event_records.insert("init_chaos", "ready")
8697
for k, v in self.health_checkers.items():
8798
v.check_result()
8899
if is_check:
@@ -91,4 +102,9 @@ def test_operations(self, request_duration, is_check):
91102
# wait all pod ready
92103
wait_pods_ready(self.milvus_ns, f"app.kubernetes.io/instance={self.release_name}")
93104
time.sleep(60)
105+
for k, v in self.health_checkers.items():
106+
v.pause()
107+
ra = ResultAnalyzer()
108+
ra.get_stage_success_rate()
109+
ra.show_result_table()
94110
log.info("*********************Chaos Test Completed**********************")
Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
--extra-index-url https://test.pypi.org/simple/
22
docker==5.0.0
3-
grpcio==1.37.1
3+
grpcio==1.53.0
44
grpcio-tools==1.37.1
5-
pymilvus==2.0.0rc8
5+
pymilvus==2.0.0rc8
6+
7+
# for test result analyzer
8+
prettytable==3.8.0
9+
pyarrow==11.0.0
10+
fastparquet==2023.7.0

tests/python_client/requirements.txt

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,15 @@ Faker==19.2.0
3939
h5py==3.8.0
4040

4141
# for log
42-
loguru==0.6.0
42+
loguru==0.7.0
4343

4444
# util
4545
psutil==5.9.4
46+
pandas==1.5.3
4647
# for standby test
4748
etcd-sdk-python==0.0.2
49+
50+
# for test result analyzer
51+
prettytable==3.8.0
52+
pyarrow==11.0.0
53+
fastparquet==2023.7.0

0 commit comments

Comments
 (0)