Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

2022Tencent Rhino-bird Open-source Training Program—Angel-Shengyang Li —— week2 #1222

Open
Richard3927 opened this issue Aug 7, 2022 · 0 comments

Comments

@Richard3927
Copy link

Richard3927 commented Aug 7, 2022

本周任务:不同层次结构相似度计算,struct2Vec论文学习。

实际情况:

1.上周的优化

由于上周采取的用Windows来跑本地的demo,为了后续的进一步操作,本周采取了搭建虚拟机,用docker来运行项目的方法。遇到的问题有:虚拟机的搭建,环境的基本配置(maven,spark,java等等),git的拉取项目到本地,以及实际运行。由于之前未接触过相关知识与领域,耽误了一定时间。

2.其他,进行了论文的进一步学习。基本分为以下几个步骤。(:可参考 https://zhuanlan.zhihu.com/p/56733145)

a.算法原理:Struc2Vec是从空间结构相似性的角度定义顶点相似度的。

b.定点对距离的定义。

c.构建层次带权图。

d.采样获取顶点序列。

e.优化

3.基本算法实现的学习

参考的链接:https://blog.csdn.net/u012151283/article/details/87255951

a.对距离定义的实现:

def _get_order_degreelist_node(self, root, max_num_layers=None):
    if max_num_layers is None:
        max_num_layers = float('inf')

    ordered_degree_sequence_dict = {}
    visited = [False] * len(self.graph.nodes())
    queue = deque()
    level = 0
    queue.append(root)
    visited[root] = True

    while (len(queue) > 0 and level <= max_num_layers):

        count = len(queue)
        if self.opt1_reduce_len:
            degree_list = {}
        else:
            degree_list = []
        while (count > 0):

            top = queue.popleft()
            node = self.idx2node[top]
            degree = len(self.graph[node])

            if self.opt1_reduce_len:
                degree_list[degree] = degree_list.get(degree, 0) + 1
            else:
                degree_list.append(degree)

            for nei in self.graph[node]:
                nei_idx = self.node2idx[nei]
                if not visited[nei_idx]:
                    visited[nei_idx] = True
                    queue.append(nei_idx)
            count -= 1
        if self.opt1_reduce_len:
            orderd_degree_list = [(degree, freq)
                                  for degree, freq in degree_list.items()]
            orderd_degree_list.sort(key=lambda x: x[0])
        else:
            orderd_degree_list = sorted(degree_list)
        ordered_degree_sequence_dict[level] = orderd_degree_list
        level += 1

    return ordered_degree_sequence_dict
def _compute_ordered_degreelist(self, max_num_layers):

    degreeList = {}
    vertices = self.idx  # self.g.nodes()
    for v in vertices:
        degreeList[v] = self._get_order_degreelist_node(v, max_num_layers)
    return degreeList
def compute_dtw_dist(part_list, degreeList, dist_func):
    dtw_dist = {}
    for v1, nbs in part_list:
        lists_v1 = degreeList[v1]  # lists_v1 :orderd degree list of v1
        for v2 in nbs:
            lists_v2 = degreeList[v2]  # lists_v1 :orderd degree list of v2
            max_layer = min(len(lists_v1), len(lists_v2))  # valid layer
            dtw_dist[v1, v2] = {}
            for layer in range(0, max_layer):
                dist, path = fastdtw(
                    lists_v1[layer], lists_v2[layer], radius=1, dist=dist_func)
                dtw_dist[v1, v2][layer] = dist
    return dtw_dist
def _compute_structural_distance(self, max_num_layers, workers=1, verbose=0,):

    if os.path.exists(self.temp_path+'structural_dist.pkl'):
        structural_dist = pd.read_pickle(
            self.temp_path+'structural_dist.pkl')
    else:
        if self.opt1_reduce_len:
            dist_func = cost_max
        else:
            dist_func = cost

        if os.path.exists(self.temp_path + 'degreelist.pkl'):
            degreeList = pd.read_pickle(self.temp_path + 'degreelist.pkl')
        else:
            degreeList = self._compute_ordered_degreelist(max_num_layers)
            pd.to_pickle(degreeList, self.temp_path + 'degreelist.pkl')

        if self.opt2_reduce_sim_calc:
            degrees = self._create_vectors()
            degreeListsSelected = {}
            vertices = {}
            n_nodes = len(self.idx)
            for v in self.idx:  # c:list of vertex
                nbs = get_vertices(
                    v, len(self.graph[self.idx2node[v]]), degrees, n_nodes)
                vertices[v] = nbs  # store nbs
                degreeListsSelected[v] = degreeList[v]  # store dist
                for n in nbs:
                    # store dist of nbs
                    degreeListsSelected[n] = degreeList[n]
        else:
            vertices = {}
            for v in degreeList:
                vertices[v] = [vd for vd in degreeList.keys() if vd > v]


        results = Parallel(n_jobs=workers, verbose=verbose,)(
            delayed(compute_dtw_dist)(part_list, degreeList, dist_func) for part_list in partition_dict(vertices, workers))
        dtw_dist = dict(ChainMap(*results))

        structural_dist = convert_dtw_struc_dist(dtw_dist)
        pd.to_pickle(structural_dist, self.temp_path +
                     'structural_dist.pkl')

    return structural_dist

b.构建层次带权图

def _get_transition_probs(self, layers_adj, layers_distances):
    layers_alias = {}
    layers_accept = {}

    for layer in layers_adj:

        neighbors = layers_adj[layer]
        layer_distances = layers_distances[layer]
        node_alias_dict = {}
        node_accept_dict = {}
        norm_weights = {}

        for v, neighbors in neighbors.items():
            e_list = []
            sum_w = 0.0

            for n in neighbors:
                if (v, n) in layer_distances:
                    wd = layer_distances[v, n]
                else:
                    wd = layer_distances[n, v]
                w = np.exp(-float(wd))
                e_list.append(w)
                sum_w += w

            e_list = [x / sum_w for x in e_list]
            norm_weights[v] = e_list
            accept, alias = create_alias_table(e_list)
            node_alias_dict[v] = alias
            node_accept_dict[v] = accept

        pd.to_pickle(
            norm_weights, self.temp_path + 'norm_weights_distance-layer-' + str(layer)+'.pkl')

        layers_alias[layer] = node_alias_dict
        layers_accept[layer] = node_accept_dict

    return layers_accept, layers_alias
def prepare_biased_walk(self,):

    sum_weights = {}
    sum_edges = {}
    average_weight = {}
    gamma = {}
    layer = 0
    while (os.path.exists(self.temp_path+'norm_weights_distance-layer-' + str(layer))):
        probs = pd.read_pickle(
            self.temp_path+'norm_weights_distance-layer-' + str(layer))
        for v, list_weights in probs.items():
            sum_weights.setdefault(layer, 0)
            sum_edges.setdefault(layer, 0)
            sum_weights[layer] += sum(list_weights)
            sum_edges[layer] += len(list_weights)

        average_weight[layer] = sum_weights[layer] / sum_edges[layer]

        gamma.setdefault(layer, {})

        for v, list_weights in probs.items():
            num_neighbours = 0
            for w in list_weights:
                if (w > average_weight[layer]):
                    num_neighbours += 1
            gamma[layer][v] = num_neighbours

        layer += 1

    pd.to_pickle(average_weight, self.temp_path + 'average_weight')
    pd.to_pickle(gamma, self.temp_path + 'gamma.pkl')

4.下周目标:结合上周学习,用angel实际实现层次带权图的构建。尝试实现多层次带权网络。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant