Skip to content

Commit 8528bcf

Browse files
committed
full loop
1 parent 2245a87 commit 8528bcf

61 files changed

Lines changed: 46672 additions & 918 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

codes/.DS_Store

6 KB
Binary file not shown.
7.99 KB
Binary file not shown.

codes/graphBuild/dg_utils.py

Lines changed: 374 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,374 @@
1+
import numpy as np
2+
import json
3+
import os
4+
import shutil
5+
import random
6+
7+
8+
def ClearDir(dirpath):
    """Reset ``dirpath`` to an empty directory.

    If something already exists at ``dirpath`` a notice is printed and the
    whole tree is removed with ``shutil.rmtree``; the directory is then
    created again with ``os.makedirs``.

    NOTE(review): the extracted source lost its indentation. This assumes
    ``os.makedirs`` runs unconditionally (a missing directory is created
    too) -- confirm against the original file.

    Args:
        dirpath: Path of the directory to empty / create.
    """
    already_present = os.path.exists(dirpath)
    if already_present:
        print("正在删除.....", dirpath)
        shutil.rmtree(dirpath)

    # Always leave a fresh, empty directory behind.
    os.makedirs(dirpath)
13+
14+
15+
def GetFilesIn(dir):
    """Return the paths of all files found under ``dir``, recursively.

    Each path is built with ``os.path.join`` from the directory reported by
    ``os.walk`` and the file name. The previous plain string concatenation
    dropped the path separator, so files in sub-directories (and files of a
    ``dir`` given without a trailing separator) got invalid paths.

    Args:
        dir: Root directory to walk. The name shadows the builtin but is
            kept so keyword callers continue to work.

    Returns:
        A list of file paths in ``os.walk`` order; empty when ``dir`` does
        not exist or contains no files.
    """
    res = []
    # Use a distinct loop name so the ``dir`` parameter is not rebound.
    for current_dir, _, file_names in os.walk(dir):
        res.extend(os.path.join(current_dir, name) for name in file_names)
    return res
21+
22+
23+
def GetGraphIDFromPath(path):
    """Parse the integer graph ID embedded in a file path.

    The path is split on ``"."``; the second-to-last piece is then split on
    ``"_"`` and its last piece is converted with ``int``. For example
    ``"graph_12.txt"`` yields ``12``. The path is also printed to stdout.

    Args:
        path: Path string containing at least one ``"."``.

    Returns:
        The graph ID as an ``int``.

    Raises:
        IndexError: If ``path`` contains no ``"."``.
        ValueError: If the extracted piece is not an integer literal.
    """
    print(path)
    before_extension = path.split(".")[-2]
    id_text = before_extension.rsplit("_", 1)[-1]
    return int(id_text)
27+
28+
29+
def GetPointSets(pts_size,
                 num_clusters,
                 half_space_dist_,
                 dim,
                 gauss_delta_,
                 _means=np.array([])):
    """Generate the initial point set as a mixture of Gaussian clusters.

    Each cluster is sampled from a multivariate normal distribution whose
    covariance is ``gauss_delta_`` on the diagonal. Points and labels are
    shuffled with the same permutation before being returned.

    When ``pts_size`` is not a multiple of ``num_clusters`` the remainder is
    spread over the first clusters (one extra point each), so the result
    always has exactly ``pts_size`` rows. Previously this case crashed in
    ``reshape``. For divisible sizes the random draws happen in the same
    order as before.

    Args:
        pts_size: Total number of points to generate.
        num_clusters: Number of clusters; labels are ``0 .. num_clusters-1``.
        half_space_dist_: Cluster means are drawn uniformly from
            ``[-half_space_dist_, half_space_dist_)`` per coordinate when
            ``_means`` is empty.
        dim: Dimensionality of each point.
        gauss_delta_: Value placed on the diagonal of the covariance matrix.
        _means: Optional array-like of cluster means, indexed by cluster.
            When empty (the default) the means are drawn at random. The
            default array is never mutated.

    Returns:
        ``(p, l)`` where ``p`` has shape ``(pts_size, dim)`` and ``l`` has
        shape ``(pts_size, 1)`` holding the cluster label of each row.
    """
    # Accept plain lists as well as arrays for the optional means.
    _means = np.asarray(_means)
    base_count, remainder = divmod(pts_size, num_clusters)

    # The covariance is identical for every cluster, so build it once.
    _cov = np.diag(np.array([gauss_delta_ for _ in range(dim)]))

    _points = []
    _labels = []
    for i in range(num_clusters):
        if _means.shape[0] == 0:
            _mean = np.random.uniform(-half_space_dist_, half_space_dist_,
                                      (dim))
        else:
            _mean = _means[i]

        # The first ``remainder`` clusters absorb the leftover points.
        cluster_size = base_count + (1 if i < remainder else 0)
        _points.append(
            np.random.multivariate_normal(size=cluster_size,
                                          mean=_mean,
                                          cov=_cov))
        _labels.append(np.full((cluster_size, 1), i))

    p = np.concatenate(_points, axis=0)
    l = np.concatenate(_labels, axis=0)

    # Restore the RNG state between the two shuffles so points and labels
    # receive the same permutation and stay aligned row by row.
    state = np.random.get_state()
    np.random.shuffle(p)
    np.random.set_state(state)
    np.random.shuffle(l)
    return p, l
60+
61+
62+
def GenDistubIds(pts_size, keep_ratio):
    """Randomly split point ids into a kept set and a disturbed set.

    Args:
        pts_size: Number of points; ids are ``0 .. pts_size-1``.
        keep_ratio: Fraction of ids to keep; ``int(keep_ratio * pts_size)``
            ids are drawn with ``random.sample``.

    Returns:
        ``(keep_ids, dist_ids)``: the kept ids (points treated as similar)
        and the remaining ids (points treated as dissimilar), both in
        ascending order.
    """
    ids = range(0, pts_size)

    keep_ids = sorted(random.sample(ids, int(keep_ratio * pts_size)))

    # Set membership keeps the complement linear instead of quadratic.
    kept = set(keep_ids)
    dist_ids = [i for i in ids if i not in kept]

    return keep_ids, dist_ids
72+
73+
# keep_ids, dist_ids = GenDistubIds(pts_size, keep_ratio)
74+
75+
76+
# disturb given points
77+
def DisturbPoints(inputs, dim, keep_ids, dist_ids, disturb_dist, HARD_MOVE):
    """Displace the points listed in ``dist_ids``.

    ``inputs`` is modified in place and the same object is returned.

    Args:
        inputs: Indexable collection of points supporting ``+=`` per row.
        dim: Length of the displacement vector.
        keep_ids: Ids considered undisturbed; returned unchanged.
        dist_ids: Ids of the points to displace.
        disturb_dist: Displacement magnitude.
        HARD_MOVE: When equal to ``True`` every disturbed point moves by
            ``disturb_dist`` along each axis; otherwise each point gets its
            own offset drawn uniformly from
            ``[-disturb_dist, disturb_dist)`` per axis.

    Returns:
        ``(inputs, keep_ids)``.
    """
    fixed_offset = (HARD_MOVE == True)  # noqa: E712 - original equality test kept
    shifted = inputs

    for point_id in dist_ids:
        if fixed_offset:
            offset = np.tile(disturb_dist, dim)
        else:
            # A fresh random offset is drawn for every point.
            offset = np.random.uniform(-disturb_dist, disturb_dist, (dim))
        shifted[point_id] += offset

    return shifted, keep_ids
88+
89+
90+
# disturb clusters of given label
91+
def DisturbClusters(inputs, dim, labels, disturb_label, pts_size, disturb_dist,
                    HARD_MOVE):
    """Displace every point whose label equals ``disturb_label``.

    The disturbed ids are collected from ``labels`` and handed to
    ``DisturbPoints`` together with the remaining ids.

    Args:
        inputs: Points to disturb; forwarded to ``DisturbPoints``.
        dim: Length of the displacement vector.
        labels: Per-point labels; ``labels[i] == disturb_label`` must be
            usable as a truth value.
        disturb_label: Label of the cluster to disturb.
        pts_size: Number of points; kept ids are drawn from
            ``0 .. pts_size-1``.
        disturb_dist: Displacement magnitude.
        HARD_MOVE: Forwarded to ``DisturbPoints``.

    Returns:
        Whatever ``DisturbPoints`` returns for these arguments.
    """
    # dist_ids: dissimilar (disturbed) points.
    dist_ids = [i for i in range(len(labels)) if labels[i] == disturb_label]

    # keep_ids: similar (untouched) points. A set makes the complement
    # linear instead of quadratic in the number of points.
    disturbed = set(dist_ids)
    keep_ids = [i for i in range(0, pts_size) if i not in disturbed]

    return DisturbPoints(inputs, dim, keep_ids, dist_ids, disturb_dist,
                         HARD_MOVE)
101+
102+
103+
# translate the clusters
104+
def TranslateClusters(inputs, dim, labels, disturb_labels, pts_size,
                      disturb_dist, HARD_MOVE):
    """Translate all points of the given clusters by one shared vector.

    ``inputs`` is modified in place and the same object is returned.

    Args:
        inputs: Indexable collection of points supporting ``+=`` per row.
        dim: Length of the displacement vector.
        labels: Per-point labels.
        disturb_labels: Collection of labels whose points are translated.
        pts_size: Number of points; kept ids are drawn from
            ``0 .. pts_size-1``.
        disturb_dist: Displacement magnitude.
        HARD_MOVE: When truthy the shared vector is ``disturb_dist`` along
            every axis; otherwise it is drawn once, uniformly from
            ``[-disturb_dist, disturb_dist)`` per axis.

    Returns:
        ``(inputs, keep_ids)`` where ``keep_ids`` lists the ids that were
        not translated, in ascending order.
    """
    outputs = inputs

    # dist_ids: dissimilar (translated) points.
    dist_ids = [i for i in range(len(labels)) if labels[i] in disturb_labels]

    # keep_ids: similar (untouched) points. Set membership avoids the
    # quadratic ``not in list`` scan.
    translated = set(dist_ids)
    keep_ids = [i for i in range(0, pts_size) if i not in translated]

    if HARD_MOVE:
        _moveVec = np.tile(disturb_dist, dim)
    else:
        _moveVec = np.random.uniform(-disturb_dist, disturb_dist, (dim))

    # Every translated point receives the same displacement.
    for i in dist_ids:
        outputs[i] += _moveVec
    return outputs, keep_ids
123+
124+
125+
# disturb point sets from different clusters
126+
def DisturbPointSets(inputs, labels, disturb_label_num, keep_ratio,
                     pts_size=None, num_clusters=None, dim=None,
                     disturb_dist=None, HARD_MOVE_=None):
    """Disturb an equal number of points in several randomly chosen clusters.

    ``disturb_label_num`` clusters are picked at random. In each of them
    ``int((1 - keep_ratio) * pts_size / disturb_label_num)`` points are
    sampled and moved by one displacement vector per cluster. ``inputs`` is
    modified in place and the same object is returned.

    The function used to read ``pts_size``, ``num_clusters``, ``dim``,
    ``disturb_dist`` and ``HARD_MOVE_`` as module-level globals that this
    module never defines. They are now optional keyword parameters; when one
    is left as ``None`` the module global of the same name is still used if
    it exists, so existing setups keep working.

    Args:
        inputs: Indexable collection of points supporting ``+=`` per row.
        labels: Per-point labels; assumed to be ``0 .. num_clusters-1``
            because clusters are sampled from ``range(num_clusters)``.
        disturb_label_num: Number of clusters to disturb.
        keep_ratio: Fraction of all points that should stay untouched.
        pts_size: Total number of points.
        num_clusters: Number of clusters.
        dim: Length of the displacement vector.
        disturb_dist: Displacement magnitude.
        HARD_MOVE_: When truthy each cluster moves by ``disturb_dist`` along
            every axis with a random sign; otherwise the vector is drawn
            uniformly from ``[-disturb_dist, disturb_dist)`` per axis.

    Returns:
        ``(inputs, keep_ids)`` where ``keep_ids`` lists the untouched ids in
        ascending order.

    Raises:
        NameError: If a setting is neither passed nor defined as a module
            global.
        ValueError: From ``random.sample`` if a chosen cluster has fewer
            points than must be disturbed.
    """
    module_scope = globals()

    def _setting(name, value):
        # An explicit argument wins; otherwise fall back to the module
        # global that the original implementation read implicitly.
        if value is not None:
            return value
        if name in module_scope:
            return module_scope[name]
        raise NameError(
            "%s must be passed to DisturbPointSets or defined as a "
            "module-level global" % name)

    pts_size = _setting("pts_size", pts_size)
    num_clusters = _setting("num_clusters", num_clusters)
    dim = _setting("dim", dim)
    disturb_dist = _setting("disturb_dist", disturb_dist)
    HARD_MOVE_ = _setting("HARD_MOVE_", HARD_MOVE_)

    assert (disturb_label_num > 0 and disturb_label_num <= num_clusters)

    output = inputs
    disturb_labels = random.sample(range(num_clusters), disturb_label_num)

    num_each_cluster = int((1. - keep_ratio) * pts_size / disturb_label_num)

    # Disturb the same number of points in every selected cluster.
    dist_ids = {}
    for label in disturb_labels:
        label_ids = [i for i in range(len(labels)) if labels[i] == label]
        dist_ids[label] = random.sample(label_ids, num_each_cluster)

    # keep_ids: every id that was not sampled for disturbance. One set over
    # all clusters replaces the nested list scans.
    disturbed = set()
    for label in disturb_labels:
        disturbed.update(dist_ids[label])
    keep_ids = [i for i in range(0, pts_size) if i not in disturbed]

    for label in disturb_labels:
        if HARD_MOVE_:
            _moveVec = np.tile(disturb_dist, dim) * random.sample([-1, 1],
                                                                  1)[0]
        else:
            _moveVec = np.random.uniform(-disturb_dist, disturb_dist, (dim))

        # All sampled points of one cluster share the same displacement.
        for i in dist_ids[label]:
            output[i] += _moveVec

    return output, keep_ids
165+
166+
167+
def shiftAllPoints(inputs, dim, disturb_dist, HARD_MOVE_):
    """Shift every point by the same displacement vector.

    ``inputs`` is modified in place and the same object is returned.

    Args:
        inputs: Array of points; assumed to be 2-D with ``dim`` columns --
            TODO confirm against callers.
        dim: Length of the displacement vector.
        disturb_dist: Displacement magnitude.
        HARD_MOVE_: When equal to ``True`` the vector is ``disturb_dist``
            along every axis with one random sign; otherwise it is drawn
            uniformly from ``[-disturb_dist, disturb_dist)`` per axis.

    Returns:
        ``(inputs, [])``; the empty list signals that no id was kept.
    """
    shifted = inputs

    if HARD_MOVE_ == True:  # noqa: E712 - original equality test kept
        direction = random.sample([-1, 1], 1)[0]
        offset = np.tile(disturb_dist, dim) * direction
    else:
        offset = np.random.uniform(-disturb_dist, disturb_dist, (dim))

    # Broadcasting applies the one offset to every row.
    shifted += offset

    # No point is left undisturbed, so there are no ids to keep.
    return shifted, []
179+
180+
181+
# add noise to undisturbed points
182+
def addNoise(inputs, dim, dist_ids, noise_intensity):
    """Add independent uniform noise to the points listed in ``dist_ids``.

    ``inputs`` is modified in place and the same object is returned. A
    progress message is printed once all points are processed.

    Args:
        inputs: Indexable collection of points supporting ``+=`` per row.
        dim: Length of each noise vector.
        dist_ids: Ids of the points that receive noise.
        noise_intensity: Noise is drawn uniformly from
            ``[-noise_intensity, noise_intensity)`` per axis.

    Returns:
        ``inputs`` after the noise has been added.
    """
    noisy = inputs
    lower, upper = -noise_intensity, noise_intensity

    for point_id in dist_ids:
        # Each point gets its own freshly drawn noise vector.
        noisy[point_id] += np.random.uniform(lower, upper, (dim))

    print("add noise to current data")
    return noisy
190+
191+
192+
def overlapClusters(inputs, pts_size, dim, means, labels, merge_labels):
    """Move several clusters so that their centres coincide.

    The target centre is the average of ``means[label]`` over
    ``merge_labels``. Every point belonging to one of those clusters is
    translated by the difference between the target centre and its own
    cluster mean. ``inputs`` is modified in place and returned.

    Args:
        inputs: 2-D array of points (indexed as ``inputs[id, :]``).
        pts_size: Number of points; ids ``0 .. pts_size-1`` are visited.
        dim: Dimensionality of the points and of the means.
        means: Array of cluster means indexed by label.
        labels: Per-point labels.
        merge_labels: Labels of the clusters to overlap.

    Returns:
        ``inputs`` after the translation.
    """
    merged = inputs

    # Centre of the centres of the clusters being merged.
    target_center = np.zeros((dim))
    for cluster in merge_labels:
        target_center += means[cluster]
    target_center /= len(merge_labels)

    for point_id in range(0, pts_size):
        if labels[point_id] not in merge_labels:
            continue
        # Shift the point by how far its cluster centre has to travel.
        own_center = means[labels[point_id], :].reshape(dim, )
        merged[point_id, :] += target_center - own_center

    return merged
211+
212+
213+
# Shrink the whole cluster without changing its kNN
214+
def scaleCluster(inputs,
                 pts_size,
                 dim,
                 labels,
                 shrink_label,
                 new_center,
                 scale_factor=0.25):
    """Scale one cluster about its centre and re-centre it at ``new_center``.

    The cluster centre is obtained from ``cluster_center`` before any point
    is changed. Each point of the cluster is then replaced by
    ``new_center + (point - centre) * scale_factor``. ``inputs`` is modified
    in place and returned.

    Args:
        inputs: Array of points.
        pts_size: Number of points; ids ``0 .. pts_size-1`` are visited.
        dim: Dimensionality of the points.
        labels: Per-point labels.
        shrink_label: Label of the cluster to scale.
        new_center: Location the scaled cluster is centred on.
        scale_factor: Multiplier applied to each point's offset from the
            cluster centre.

    Returns:
        ``inputs`` after scaling.
    """
    scaled = inputs

    # Compute the current centre of the cluster first.
    old_center = cluster_center(inputs, pts_size, dim, labels, shrink_label)

    # Scale every member around that centre and move it to the new centre.
    for point_id in range(0, pts_size):
        if labels[point_id] == shrink_label:
            offset = scaled[point_id] - old_center
            scaled[point_id] = new_center + offset * scale_factor

    return scaled
237+
238+
239+
def cluster_center(inputs, pts_size, dim, labels, label):
    """Return the mean position of all points carrying ``label``.

    Args:
        inputs: 2-D array of points (indexed as ``inputs[id, :]``).
        pts_size: Number of points; ids ``0 .. pts_size-1`` are visited.
        dim: Dimensionality of the points.
        labels: Per-point labels.
        label: Label of the cluster whose centre is wanted.

    Returns:
        A float array of length ``dim``. If no point carries ``label`` the
        sum is divided by zero.
    """
    member_ids = [idx for idx in range(0, pts_size) if labels[idx] == label]

    total = np.zeros((dim))
    for idx in member_ids:
        total += inputs[idx, :]

    total /= float(len(member_ids))
    return total
251+
252+
253+
def splitClusters(inputs, pts_size, dim, labels, split_label, disturb_dist):
    """Split one cluster into two halves moved in opposite directions.

    Half of the cluster's points (rounded down, chosen with
    ``random.sample``) are moved by ``-disturb_dist`` along every axis; the
    remaining points are moved by ``+disturb_dist``. ``inputs`` is modified
    in place and returned.

    Args:
        inputs: Indexable collection of points supporting ``+=`` per row.
        pts_size: Number of points; ids ``0 .. pts_size-1`` are visited.
        dim: Length of the displacement vector.
        labels: Per-point labels.
        split_label: Label of the cluster to split.
        disturb_dist: Displacement magnitude for each half.

    Returns:
        ``inputs`` after the split.
    """
    output = inputs

    split_ids = [i for i in range(0, pts_size) if labels[i] == split_label]

    split_ids_0 = random.sample(split_ids, len(split_ids) // 2)
    # Set membership keeps the complement linear instead of quadratic.
    first_half = set(split_ids_0)
    split_ids_1 = [i for i in split_ids if i not in first_half]

    _moveVec = np.tile(-disturb_dist, dim)
    for i in split_ids_0:
        output[i] += _moveVec

    _moveVec = np.tile(disturb_dist, dim)
    for i in split_ids_1:
        output[i] += _moveVec

    return output
271+
272+
273+
def DistOfEdges(dists, indices):
    """Build a mapping from directed edge ``(i, j)`` to its distance.

    NOTE: this function is defined a second time further down in this file
    with the same body; the later definition is the one in effect after
    import.

    Args:
        dists: 2-D array where ``dists[i][k]`` is the distance from vertex
            ``i`` to its ``k``-th listed neighbour.
        indices: 2-D array where ``indices[i][k]`` is the id of that
            neighbour.

    Returns:
        A dict keyed by ``(i, indices[i][k])`` with value ``dists[i][k]``.
        Self-loops are skipped; later duplicates overwrite earlier ones.
    """
    edges = {}
    for source, neighbours in enumerate(indices):
        for column, target in enumerate(neighbours):
            if source == target:
                # Skip self-loops.
                continue
            edges[(source, target)] = dists[source][column]
    return edges
282+
283+
284+
# save points and labels into file
285+
def savetxt(filepath, cur_points, labels):
    """Write points and their labels to a tab-separated text file.

    Each output line holds the coordinates of one point formatted with 16
    decimals, each followed by a tab, and then the integer label. The
    absolute path of the file is printed once it has been written.

    Args:
        filepath: Destination path; the file is overwritten.
        cur_points: 2-D array of coordinates.
        labels: Per-point labels; ``labels[i]`` must be formattable with
            ``%d``.
    """
    row_count, column_count = cur_points.shape[0], cur_points.shape[1]

    with open(filepath, 'w') as out_file:
        for row in range(row_count):
            coords = "".join("%.16f\t" % (cur_points[row][col])
                             for col in range(column_count))
            line = coords + "%d\n" % (labels[row])
            out_file.write(line)

    print(os.path.abspath(filepath) + " saved.")
295+
296+
297+
def queryLabelIds(ids, labels, queryLabels=[]):
    """Return the ids whose label is one of ``queryLabels``.

    Args:
        ids: Iterable of ids used to index ``labels``.
        labels: Indexable collection of labels.
        queryLabels: Labels to select. The default list is only read, never
            modified.

    Returns:
        A list of the matching ids in the order they appear in ``ids``.
    """
    selected = []
    for candidate in ids:
        if labels[candidate] in queryLabels:
            selected.append(candidate)
    return selected
299+
300+
301+
def DistOfEdges(dists, indices):
    """Build a mapping from directed edge ``(i, j)`` to its distance.

    NOTE: this repeats an earlier definition of the same name in this file;
    being the later one, it is the definition in effect after import.

    Args:
        dists: 2-D array where ``dists[i][k]`` is the distance from vertex
            ``i`` to its ``k``-th listed neighbour.
        indices: 2-D array where ``indices[i][k]`` is the id of that
            neighbour.

    Returns:
        A dict keyed by ``(i, indices[i][k])`` with value ``dists[i][k]``.
        Self-loops are skipped; later duplicates overwrite earlier ones.
    """
    edges = {}
    for source, neighbours in enumerate(indices):
        for column, target in enumerate(neighbours):
            if source == target:
                # Skip self-loops.
                continue
            edges[(source, target)] = dists[source][column]
    return edges
310+
311+
312+
def writeInfo(filepath, info):
    """Serialise ``info`` as JSON into ``filepath``.

    Args:
        filepath: Destination path; the file is overwritten and written as
            UTF-8.
        info: Any object accepted by ``json.dump``.
    """
    with open(filepath, mode='w', encoding='utf-8') as out_file:
        json.dump(obj=info, fp=out_file)
315+
316+
317+
# # Amplify the cluster
318+
# def DiffuseCluster(inputs, labels, disturb_label):
319+
# output = inputs
320+
# assert(disturb_label>=0 and disturb_label<=num_clusters)
321+
322+
# # compute the centroid of the cluster
323+
# disturb_ids = [i for i in range(len(labels)) if labels[i] == disturb_label]
324+
# disturb_inputs = [inputs[id] for id in disturb_ids]
325+
# disturb_inputs = np.array(disturb_ids)
326+
327+
# centroid = np.sum(disturb_ids, 0) / disturb_ids.shape(0)
328+
329+
# # for each point, compute the vector
330+
# for i in disturb_inputs.shape[0]:
331+
# # each point go through the vector
332+
# output[i] +=
333+
334+
# return output
335+
336+
# # random select keeping edges. NEVER USE IT
337+
# def DisturbEdges(inputs, kd_tree, keep_ratio=0.7):
338+
# outputs = inputs
339+
# dists, indices = kd_tree.query(
340+
# inputs, k=k_closest_count) # 一口气对所有points构建knn
341+
# edge_size = indices.shape[0]*(k_closest_count-1)
342+
343+
# np.arange(0, points.shape[0])
344+
# ids = range(0, pts_size)
345+
346+
# # keep_edges 相似性的边
347+
# keep_edges = []
348+
# while len(keep_edges) != edge_size*keep_ratio:
349+
# keep_ids_0 = np.random.randint(0, pts_size)
350+
# keep_ids_1 = np.random.randint(0, pts_size)
351+
# # ensure no self-loop and duplicate
352+
# if keep_ids_0 == keep_ids_1 or [keep_ids_0, keep_ids_1] in keep_edges:
353+
# continue
354+
# keep_edges.append([keep_ids_0, keep_ids_1])
355+
356+
# keep_edges.sort()
357+
# # dist_edges 不相似的边
358+
# dist_edges = []
359+
# for i in range(len(indices)):
360+
# for j in indices[i]:
361+
# # print(i)
362+
# # print(j)
363+
# if [i, j] not in keep_edges:
364+
# dist_edges.append([i, j])
365+
366+
# # disturb edge endpoints
367+
# for [i, j] in dist_edges:
368+
# _moveVec_i = np.random.uniform(-0.5, 0.5, (dim))
369+
# _moveVec_j = np.random.uniform(-0.5, 0.5, (dim))
370+
371+
# outputs[i] += _moveVec_i
372+
# outputs[j] += _moveVec_j
373+
374+
# return outputs, keep_edges

0 commit comments

Comments
 (0)