From 8a52dd804fdd169cf9f7ca07ca2902d3738e4b0a Mon Sep 17 00:00:00 2001 From: Duy Kha Date: Tue, 29 Jul 2025 22:05:19 -0500 Subject: [PATCH 1/8] yaml config file --- GraphGeneration/encoder.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 GraphGeneration/encoder.yaml diff --git a/GraphGeneration/encoder.yaml b/GraphGeneration/encoder.yaml new file mode 100644 index 0000000..ed82350 --- /dev/null +++ b/GraphGeneration/encoder.yaml @@ -0,0 +1,19 @@ +model: + name: topoGED + node2vec_setup: + node2vec_dimensions: 64 # We add features onto the end since Node2Vec doesn't embed features + node2vec_walk_length: 50 # Number of nodes visited per walk (Higher is more global, smaller is local) + node2vec_num_walks: 10 # Number of walks to start per node (Higher is more detailed and stable) + node2vec_p: 1.0 # Return parameter, the likelihood of revisiting a node (Higher is less backtracking) + node2vec_q: 1.0 # The walk bias for determining direction (Higher is more DFS-like; lower is BFS-like) + node2vec_window: 10 # The context size (Higher is broader learning) + node2vec_min_count: 1 # Minimum number of occurrences for a node to be considered (Higher will ignore more rare nodes) + node2vec_batch_words: 4 # The batch size for when Word2Vec is used (Higher will train faster; but with more memory) + node2vec_workers: 1 # Number of workers (threads) + +training: + batch_size: 64 + lr: 0.001 + epochs: 500 + +dataset: CollegeMsg From e0f28f82612458b9e31ba82c575927db4bde1f4e Mon Sep 17 00:00:00 2001 From: Duy Kha Date: Wed, 30 Jul 2025 12:16:29 -0500 Subject: [PATCH 2/8] use yaml in file --- GraphGeneration/encoder.yaml | 2 + GraphGeneration/scripts/compute_embedding.py | 37 ++++------ GraphGeneration/scripts/topoGED_end_to_end.py | 74 ++++++++++++++----- 3 files changed, 73 insertions(+), 40 deletions(-) diff --git a/GraphGeneration/encoder.yaml b/GraphGeneration/encoder.yaml index ed82350..b701a10 100644 --- a/GraphGeneration/encoder.yaml +++ b/GraphGeneration/encoder.yaml @@ -10,6 +10,7 @@ model: node2vec_min_count: 1 # Minimum number of occurrences for a node to be considered (Higher will ignore more rare nodes) node2vec_batch_words: 4 # The batch size for when Word2Vec is used (Higher will train faster; but with more memory) node2vec_workers: 1 # Number of workers (threads) + hidden_dim: 64 training: batch_size: 64 @@ -17,3 +18,4 @@ training: epochs: 500 dataset: CollegeMsg +seed: 1024 diff --git a/GraphGeneration/scripts/compute_embedding.py b/GraphGeneration/scripts/compute_embedding.py index 0e71d47..26dee0d 100644 --- a/GraphGeneration/scripts/compute_embedding.py +++ b/GraphGeneration/scripts/compute_embedding.py @@ -4,17 +4,11 @@ import torch from node2vec import Node2Vec from GraphGeneration.models.temporal_gnn.script.config import args +import yaml -# Node2Vec Parameters -node2vec_dimensions = args.nfeat # We add features onto the end since Node2Vec doesn't embed features -node2vec_walk_length = 50 # Number of nodes visited per walk (Higher is more global, smaller is local) -node2vec_num_walks = 10 # Number of walks to start per node (Higher is more detailed and stable) -node2vec_p = 1.0 # Return parameter, the likelihood of revisiting a node (Higher is less backtracking) -node2vec_q = 1.0 # The walk bias for determining direction (Higher is more DFS-like; lower is BFS-like) -node2vec_window = 10 # The context size (Higher is broader learning) -node2vec_min_count = 1 # Minimum number of occurrences for a node to be considered (Higher will ignore more rare nodes) -node2vec_batch_words = 4 # The batch size for when Word2Vec is used (Higher will train faster; but with more memory) -node2vec_workers = 1 # Number of workers (threads) +# Load YAML config +with open("GraphGeneration/encoder.yaml", "r") as file: + encoder_config = yaml.safe_load(file) def compute_linear_gnn_embeddings(G: nx.DiGraph, device): """ @@ -61,19 +55,19 @@ def compute_node2vec_embeddings(G: nx.DiGraph, device): """ node2vec = Node2Vec( G, - dimensions=node2vec_dimensions, - walk_length=node2vec_walk_length, - num_walks=node2vec_num_walks, - workers=node2vec_workers, - p=node2vec_p, - q=node2vec_q, + dimensions=encoder_config["model"]["node2vec_setup"]["node2vec_setup"]["node2vec_dimensions"], + walk_length=encoder_config["model"]["node2vec_setup"]["node2vec_walk_length"], + num_walks=encoder_config["model"]["node2vec_setup"]["node2vec_num_walks"], + workers=encoder_config["model"]["node2vec_setup"]["node2vec_workers"], + p=encoder_config["model"]["node2vec_setup"]["node2vec_p"], + q=encoder_config["model"]["node2vec_setup"]["node2vec_q"], quiet=True ) model = node2vec.fit( - window=node2vec_window, - min_count=node2vec_min_count, - batch_words=node2vec_batch_words + window=encoder_config["model"]["node2vec_setup"]["node2vec_window"], + min_count=encoder_config["model"]["node2vec_setup"]["node2vec_min_count"], + batch_words=encoder_config["model"]["node2vec_setup"]["node2vec_batch_words"] ) # Perform Node2Vec # Used to generate an embedding for isolated nodes @@ -110,7 +104,8 @@ def compute_node_embeddings_LSTM(graph_snapshots, lstm_model, device): # Collect per-timestep node embeddings node_history = defaultdict(list) old_nodes = set() - null_embed = torch.tensor([0]*(node2vec_dimensions + node2vec_batch_words), dtype=torch.float32).to(device) + null_embed = torch.tensor([0]*(encoder_config["model"]["node2vec_setup"]["node2vec_dimensions"] + encoder_config["model"]["node2vec_setup"]["node2vec_batch_words"]), + dtype=torch.float32).to(device) for G in graph_snapshots: snapshot_embeddings = compute_node2vec_embeddings(G, device) for node, emb in snapshot_embeddings.items(): @@ -149,7 +144,7 @@ def get_GCN_data(graph_snapshots): x_list = [] edge_index_list = [] - F = node2vec_dimensions # number of features per node (change this if you want more features) + F = encoder_config["model"]["node2vec_setup"]["node2vec_dimensions"] # number of features per node (change this if you want more features) for G in graph_snapshots: node2vec_embeddings = compute_node2vec_embeddings(G) diff --git a/GraphGeneration/scripts/topoGED_end_to_end.py b/GraphGeneration/scripts/topoGED_end_to_end.py index 18243ad..66a37ae 100644 --- a/GraphGeneration/scripts/topoGED_end_to_end.py +++ b/GraphGeneration/scripts/topoGED_end_to_end.py @@ -9,11 +9,13 @@ import pandas as pd import os import sys +import yaml sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) from GraphGeneration.utils.Evaluator import Evaluator from GraphGeneration.models.temporal_gnn.script.config import args from load_data import load_data, generate_training_data_cached, generate_validation_data_cached +from GraphGeneration.utils.casting_type import to_tensor from GraphGeneration.utils.sampling_edges_utils import predict_edges from GraphGeneration.utils.graph_construction_utils import compute_reappearance_probabilities, get_node_features, update_degrees from create_sub_graphs import create_nn_graph, create_on_graph @@ -22,18 +24,13 @@ from GraphGeneration.models.model import setupMLP, load_encoder_model # Import all node embedding methods -from compute_embedding import compute_embedding, node2vec_batch_words +from compute_embedding import compute_embedding from process_data import modifyGraphIds, build_edgebanks_from_start from torch.utils.data import DataLoader # Import Loss fn from GraphGeneration.scripts.composite_graphlet_loss_fn import GraphletLoss -# Set seeds -global_seed = args.seed -random.seed(global_seed) -np.random.seed(global_seed) - # Set up device try: if torch.cuda.is_available(): @@ -46,9 +43,17 @@ device = torch.device("cpu") print("Using CPU") +# Load YAML config +with open("GraphGeneration/encoder.yaml", "r") as file: + encoder_config = yaml.safe_load(file) + +# Set seeds +random.seed(encoder_config["seed"]) +np.random.seed(encoder_config["seed"]) + class Runner(object): def __init__(self): - self.seed = global_seed + self.seed = encoder_config["seed"] # Set up Evaluator self.evaluator = Evaluator() @@ -68,7 +73,8 @@ def __init__(self): self.all_edge_types = ['o-o-bank', 'o-o-nobank', 'o-n', 'n-n'] # Load the global encoder & decoder model - self.encoder_model, input_dim = load_encoder_model(args, device=device, node2vec_dimensions=args.nfeat + node2vec_batch_words, hidden_dim=64) + self.encoder_model, input_dim = load_encoder_model(args, device=device, node2vec_dimensions=encoder_config["model"]["node2vec_setup"]["node2vec_dimensions"] + encoder_config["model"]["node2vec_setup"]["node2vec_batch_words"] + , hidden_dim=encoder_config["model"]["hiden_dim"]) self.link_prediction_decoder = setupMLP(embedding_dim=input_dim*2, embedding=args.embedding, mlpEncoding=args.mlpEncoding, embedOld=args.embedOld) self.link_prediction_decoder.to(device) @@ -89,12 +95,12 @@ def __init__(self): # Convert number of snapshots to integer self.num_snapshots = len(self.probabilities) self.train_end = int(0.8 * self.num_snapshots) - val_end = int(0.9 * self.num_snapshots) + self.val_end = int(0.9 * self.num_snapshots) # Assign snapshots self.training_graphs = [self.target_graphs[i][-1] for i in range(self.train_end)] - self.validation_graphs = [self.target_graphs[i][-1] for i in range(self.train_end, val_end)] - self.test_graphs = [self.target_graphs[i][-1] for i in range(val_end, self.num_snapshots)] + self.validation_graphs = [self.target_graphs[i][-1] for i in range(self.train_end, self.val_end)] + self.test_graphs = [self.target_graphs[i][-1] for i in range(self.val_end, self.num_snapshots)] # ======================= TRAIN MODEL ======================= def train_multi_head(self, training_samples, validation_samples, epochs=250, batch_size=64, training_new_edges_count=0): @@ -137,11 +143,17 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat "old_nodes": set(self.training_graphs[0].nodes()), "new_nodes": set() } + for snapshot in range(1, len(self.training_graphs)): + # Prepare current target old nodes for building graph + self.current_target_old_nodes = set().union(*[g.nodes() for g in self.training_graphs[:snapshot]]) + + # Prepare the edge type counts for current target graph self.current_target_count = { edge_type: self.probabilities[snapshot][j + 2] for j, edge_type in enumerate(self.all_edge_types) } + for flag in self.all_edge_types: curr_X_train = training_samples[flag]['X'][snapshot] curr_y_train = training_samples[flag]['y'][snapshot] @@ -183,23 +195,29 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat if n not in node_embeddings and flag in ['o-n', 'n-n']: node_types["new_nodes"].add(n) + # Assign embeddings for source nodes + # Assign zero vectors for new nodes for on and nn edge type src_embed = torch.stack([ node_embeddings[n] if n in node_embeddings and flag in ['o-n', 'n-n'] else torch.zeros(embed_dim, device=node_embeddings[any_node].device) for n in src_nodes ]) + # Assign embeddings for dest nodes + # Assign zero vectors for new nodes for on and nn edge type dst_embed = torch.stack([ node_embeddings[n] if n in node_embeddings and flag in ['o-n', 'n-n'] else torch.zeros(embed_dim, device=node_embeddings[any_node].device) for n in dst_nodes ]) + # Converting dim if src_embed.dim() == 1: src_embed = src_embed.unsqueeze(1) if dst_embed.dim() == 1: dst_embed = dst_embed.unsqueeze(1) - + + # Get predictions for link preds = self.link_prediction_decoder(src_embed=src_embed, dst_embed=dst_embed, edge_type=flag) if preds.dim() == 0: @@ -218,7 +236,7 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat train_preds.extend(preds.detach().cpu().numpy()) train_labels.extend(y.detach().cpu().numpy()) - # Assign embeddings for all the training_nodes + # Constructing temp graph curr_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=training_graphs, encoder_model=self.encoder_model, device=device) sampled_edges = predict_edges(tmp_graph, edge_type=flag, node_types=node_types, edgebank=self.all_edgebanks[snapshot], link_prediction_decoder=self.link_prediction_decoder, old_node_embeddings=curr_embeddings, top_k=self.current_target_count[flag], graph_num=snapshot, device=device) @@ -235,7 +253,22 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat node_types["old_nodes"].update(self.training_graphs[snapshot].nodes()) # Add the old nodes node_types["new_nodes"] = set() # reset new nodes - + + # Compute the graphlet loss + # Step 1: Constructing the predicted graph + self.current_target_snapshot = snapshot + pred_graph, _ = self.build_accumulating_filtration_sequence_with_edgebank() + pred_graph = pred_graph[-1] + + # Step 2: Computing the graphlet loss + pred_kernel, true_kernel, distance = self.evaluator.evaluateOrca(pred_graph, self.target_graphs[snapshot]) + graphlet_loss = graphlet_loss_fn(to_tensor(pred_kernel).unsqueeze(0), to_tensor(true_kernel).unsqueeze(0)) + graphlet_loss.backward() + optimizer.step() + + + # Validation + for flag in self.all_edge_types: if (epoch + 1) % 100 == 0 or epoch == 0: epochMessage = f"Epoch {epoch+1:02d} | Edge Type: {flag} | Train Loss: {np.mean(train_loss[flag]):.4f} | Train AUCROC {np.mean(train_auc[flag]):.4f}" @@ -263,16 +296,16 @@ def train_models(self): # Prepare training data training_sorted_samples, training_new_edges_count = generate_training_data_cached(training_graphs=self.training_graphs, - all_edgebanks=self.all_edgebanks, MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=global_seed, saved_data_file_path=self.saved_input) + all_edgebanks=self.all_edgebanks, MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=self.seed, saved_data_file_path=self.saved_input) # Prepare validation data # We pass all_edgebanks of the training snapshots edgebanks validation_sorted_samples, training_new_edges_count = generate_validation_data_cached(training_graphs=self.validation_graphs, old_training_nodes=old_training_nodes, - all_edgebanks=self.all_edgebanks[self.train_end], MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=global_seed, type_data="validation", saved_data_file_path=self.saved_input) + all_edgebanks=self.all_edgebanks[self.train_end], MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=self.seed, type_data="validation", saved_data_file_path=self.saved_input) # Prepare test data # We pass all_edgebanks of the training snapshots edgebanks test_sorted_samples, training_new_edges_count = generate_validation_data_cached(training_graphs=self.test_graphs, old_training_nodes=old_training_nodes, - all_edgebanks=self.all_edgebanks[self.train_end], MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=global_seed, type_data="test", saved_data_file_path=self.saved_input) + all_edgebanks=self.all_edgebanks[self.val_end], MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=self.seed, type_data="test", saved_data_file_path=self.saved_input) print('Training') @@ -304,6 +337,9 @@ def build_accumulating_filtration_sequence_with_edgebank(self): edgebank = self.all_edgebanks[self.current_target_snapshot] current_target_graph_description = self.graph_descriptions[self.current_target_snapshot] + # Prepare the graphs we have known + known_graphs = self.training_graphs[:self.current_target_snapshot] + V_total = int(current_target_graph_description[-1][0]) E_total = int(current_target_graph_description[-1][1]) W_total = current_target_graph_description[-1][2] @@ -345,7 +381,7 @@ def build_accumulating_filtration_sequence_with_edgebank(self): get_node_features(tmp_graph, self.thresholds, current_target_graph_description, old_nodes, new_nodes) # Assign embeddings for all the training_nodes - curr_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=self.training_graphs, encoder_model=self.encoder_model, device=device) + curr_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=known_graphs, encoder_model=self.encoder_model, device=device) # Assign zero vector for new nodes for new_node in new_nodes: @@ -362,7 +398,7 @@ def build_accumulating_filtration_sequence_with_edgebank(self): tmp_graph.add_edges_from(sampled_edges) update_degrees(tmp_graph) - new_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=self.training_graphs + [tmp_graph], encoder_model=self.encoder_model, device=device) + new_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=known_graphs + [tmp_graph], encoder_model=self.encoder_model, device=device) curr_embeddings.update(new_embeddings) # Recompute old node embeddings edge_pool = edge_pool.extend(sampled_edges) From 774b74857f30913a2c74d90294fe24e72136e9fb Mon Sep 17 00:00:00 2001 From: Duy Kha Date: Thu, 31 Jul 2025 13:37:01 -0500 Subject: [PATCH 3/8] switch to using yaml --- GraphGeneration/encoder.yaml | 7 ++- GraphGeneration/models/model.py | 28 ++------- GraphGeneration/scripts/compute_embedding.py | 22 +++---- GraphGeneration/scripts/load_data.py | 18 +++--- GraphGeneration/scripts/topoGED_end_to_end.py | 63 ++++++++++--------- .../utils/graph_construction_utils.py | 1 - 6 files changed, 64 insertions(+), 75 deletions(-) diff --git a/GraphGeneration/encoder.yaml b/GraphGeneration/encoder.yaml index b701a10..9174ae8 100644 --- a/GraphGeneration/encoder.yaml +++ b/GraphGeneration/encoder.yaml @@ -1,4 +1,4 @@ -model: +encoder_model: name: topoGED node2vec_setup: node2vec_dimensions: 64 # We add features onto the end since Node2Vec doesn't embed features @@ -11,6 +11,11 @@ model: node2vec_batch_words: 4 # The batch size for when Word2Vec is used (Higher will train faster; but with more memory) node2vec_workers: 1 # Number of workers (threads) hidden_dim: 64 + nodeEmbeddingType: "LSTM" + addOnFeature: "Position" + +decoder_model: + encode_links: "Concat" training: batch_size: 64 diff --git a/GraphGeneration/models/model.py b/GraphGeneration/models/model.py index dc454d1..49a1391 100644 --- a/GraphGeneration/models/model.py +++ b/GraphGeneration/models/model.py @@ -1,14 +1,9 @@ # Models in use -import os -import torch from GraphGeneration.models.MultiHeadedEdgePredictor import MultiHeadedEdgePredictor -from GraphGeneration.models.EdgePredictor import EdgePredictorMLP from GraphGeneration.models.temporal_gnn.script.utils.util import logger -from GraphGeneration.models.temporal_gnn.script.models.HTGN import HTGN -from GraphGeneration.models.GCLSTM import GCLSTM from GraphGeneration.models.SimpleNodeLSTM import SimpleNodeLSTM -def setupMLP(embedding_dim, embedding, mlpEncoding, embedOld): +def setupMLP(embedding_dim, mlpEncoding): """ Set up the MLP based on the arguments provided in the command line starter @@ -21,28 +16,15 @@ def setupMLP(embedding_dim, embedding, mlpEncoding, embedOld): input_dim = embedding_dim # Starting input dimension (two 32-dim node embeddings) # Set up the MLPs according to arguments - if embedOld == 'True': - flags = ['o-o-bank', 'o-o-nobank', 'o-n', 'n-n'] - else: - flags = ['o-o-nobank', 'o-n', 'n-n'] + flags = ['o-o-bank', 'o-o-nobank', 'o-n', 'n-n'] mlp = MultiHeadedEdgePredictor(in_channels=input_dim, hidden_channels=32, edge_types=flags, input_type=mlpEncoding) return mlp -def load_encoder_model(args, device, node2vec_dimensions, hidden_dim=64, HTGN_nodelist=[]): - if args.embeddingType == 'LSTM': +def load_encoder_model(encoder_config, device, node2vec_dimensions, hidden_dim=64, HTGN_nodelist=[]): + if encoder_config["encoder_model"]["nodeEmbeddingType"] == 'LSTM': model = SimpleNodeLSTM(input_dim=node2vec_dimensions, hidden_dim=hidden_dim).to(device) - elif args.embeddingType == 'GCLSTM': - model = GCLSTM(in_channels=node2vec_dimensions, hidden_channels=64).to(device) - model.device = device - elif args.embeddingType == 'HTGN': - args.num_nodes = len(HTGN_nodelist) - args.nfeat = node2vec_dimensions - args.nhid = 64 - args.nout = 64 - model = HTGN(args).to(device) - model.device = device else: raise Exception('pls define the model') - logger.info('using model {} '.format(args.embeddingType)) + logger.info('using model {} '.format(encoder_config["encoder_model"]["nodeEmbeddingType"])) return model, hidden_dim diff --git a/GraphGeneration/scripts/compute_embedding.py b/GraphGeneration/scripts/compute_embedding.py index b97248d..7b41e4a 100644 --- a/GraphGeneration/scripts/compute_embedding.py +++ b/GraphGeneration/scripts/compute_embedding.py @@ -56,19 +56,19 @@ def compute_node2vec_embeddings(G: nx.DiGraph, device): """ node2vec = Node2Vec( G, - dimensions=encoder_config["model"]["node2vec_setup"]["node2vec_dimensions"], - walk_length=encoder_config["model"]["node2vec_setup"]["node2vec_walk_length"], - num_walks=encoder_config["model"]["node2vec_setup"]["node2vec_num_walks"], - workers=encoder_config["model"]["node2vec_setup"]["node2vec_workers"], - p=encoder_config["model"]["node2vec_setup"]["node2vec_p"], - q=encoder_config["model"]["node2vec_setup"]["node2vec_q"], + dimensions=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"], + walk_length=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_walk_length"], + num_walks=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_num_walks"], + workers=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_workers"], + p=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_p"], + q=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_q"], quiet=True ) model = node2vec.fit( - window=encoder_config["model"]["node2vec_setup"]["node2vec_window"], - min_count=encoder_config["model"]["node2vec_setup"]["node2vec_min_count"], - batch_words=encoder_config["model"]["node2vec_setup"]["node2vec_batch_words"] + window=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_window"], + min_count=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_min_count"], + batch_words=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_batch_words"] ) # Perform Node2Vec # Used to generate an embedding for isolated nodes @@ -105,7 +105,7 @@ def compute_node_embeddings_LSTM(graph_snapshots, lstm_model, device): # Collect per-timestep node embeddings node_history = defaultdict(list) old_nodes = set() - null_embed = torch.tensor([0]*(encoder_config["model"]["node2vec_setup"]["node2vec_dimensions"]), + null_embed = torch.tensor([0]*(encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"]), dtype=torch.float32).to(device) for G in graph_snapshots: snapshot_embeddings = compute_node2vec_embeddings(G, device) @@ -146,7 +146,7 @@ def get_GCN_data(graph_snapshots): x_list = [] edge_index_list = [] - F = encoder_config["model"]["node2vec_setup"]["node2vec_dimensions"] # number of features per node (change this if you want more features) + F = encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"] # number of features per node (change this if you want more features) for G in graph_snapshots: node2vec_embeddings = compute_node2vec_embeddings(G) diff --git a/GraphGeneration/scripts/load_data.py b/GraphGeneration/scripts/load_data.py index 5bffaea..3042b7f 100644 --- a/GraphGeneration/scripts/load_data.py +++ b/GraphGeneration/scripts/load_data.py @@ -10,7 +10,7 @@ from utils.loader import Loader -def load_data(dataset, strategy, embedding, mlpEncoding, embedOld, trainingStyle, embeddingType): +def load_data(dataset, embedding, mlpEncoding, embeddingType): my_loader = Loader() output_dir = os.path.abspath(f'data/input/cached/{dataset}') @@ -19,14 +19,14 @@ def load_data(dataset, strategy, embedding, mlpEncoding, embedOld, trainingStyle cached_data_dataset_folder = os.path.join(output_dir, 'saved_data/') # Construct output evaluation csv - structure_pred_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/structure_pred.csv' - structure_true_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/structure_true.csv' - structure_diff_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/structure_diff.csv' - kernel_pred_file_path = f'GraphGeneration/output/results/kernel/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/kernel_pred.csv' - kernel_true_file_path = f'GraphGeneration/output/results/kernel/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/kernel_true.csv' - edge_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/edge_analysis.csv' - topER_file_path = f'GraphGeneration/output/results/topER/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/toper_diff.csv' - animation_path = f'GraphGeneration/output/results/animations/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/pred_vs_true.mp4' + structure_pred_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/structure_pred.csv' + structure_true_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/structure_true.csv' + structure_diff_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/structure_diff.csv' + kernel_pred_file_path = f'GraphGeneration/output/results/kernel/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/kernel_pred.csv' + kernel_true_file_path = f'GraphGeneration/output/results/kernel/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/kernel_true.csv' + edge_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/edge_analysis.csv' + topER_file_path = f'GraphGeneration/output/results/topER/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/toper_diff.csv' + animation_path = f'GraphGeneration/output/results/animations/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/pred_vs_true.mp4' # Create file paths if needed for path in [structure_pred_file_path, structure_true_file_path, structure_diff_file_path, kernel_pred_file_path, diff --git a/GraphGeneration/scripts/topoGED_end_to_end.py b/GraphGeneration/scripts/topoGED_end_to_end.py index 30c81f5..7e0bea7 100644 --- a/GraphGeneration/scripts/topoGED_end_to_end.py +++ b/GraphGeneration/scripts/topoGED_end_to_end.py @@ -14,7 +14,6 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) from GraphGeneration.utils.Evaluator import Evaluator -from GraphGeneration.models.temporal_gnn.script.config import args from load_data import load_data, generate_training_data_cached, generate_validation_data_cached from GraphGeneration.utils.casting_type import to_tensor from GraphGeneration.utils.sampling_edges_utils import predict_edges @@ -63,11 +62,11 @@ def __init__(self): # Some default file path self.file_visualization_path = "GraphGeneration/scripts/Visualize" - self.saved_input = os.path.abspath(f'data/input/cached/{args.dataset}/saved_data') - common_suffix = f"multiMLP_{args.strategy}_embedding{args.embedding}_mlpEncoding{args.mlpEncoding}_embeddingType{args.embeddingType}" - self.structure_dir = f"GraphGeneration/output/results/structure/{args.dataset}/{common_suffix}" - self.kernel_dir = f"GraphGeneration/output/results/kernel/{args.dataset}/{common_suffix}" - self.topER_dir = f"GraphGeneration/output/results/topER/{args.dataset}/{common_suffix}" + self.saved_input = os.path.abspath(f'data/input/cached/{encoder_config["dataset"]}/saved_data') + common_suffix = f'topoGED_embedding{encoder_config["encoder_model"]["addOnFeature"]}_mlpEncoding{encoder_config["decoder_model"]["encode_links"]}_embeddingType{encoder_config["encoder_model"]["nodeEmbeddingType"]}' + self.structure_dir = f'GraphGeneration/output/results/structure/{encoder_config["dataset"]}/{common_suffix}' + self.kernel_dir = f'GraphGeneration/output/results/kernel/{encoder_config["dataset"]}/{common_suffix}' + self.topER_dir = f'GraphGeneration/output/results/topER/{encoder_config["dataset"]}/{common_suffix}' # Current target snapshot we want to predict self.current_target_snapshot = 2 @@ -76,13 +75,19 @@ def __init__(self): self.all_edge_types = ['o-o-bank', 'o-o-nobank', 'o-n', 'n-n'] # Load the global encoder & decoder model - self.encoder_model, input_dim = load_encoder_model(args, device=device, node2vec_dimensions=encoder_config["model"]["node2vec_setup"]["node2vec_dimensions"], - hidden_dim=encoder_config["model"]["hidden_dim"]) - self.link_prediction_decoder = setupMLP(embedding_dim=input_dim*2, embedding=args.embedding, mlpEncoding=args.mlpEncoding, embedOld=args.embedOld) + self.encoder_model, input_dim = load_encoder_model(encoder_config, device=device, node2vec_dimensions=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"], + hidden_dim=encoder_config["encoder_model"]["hidden_dim"]) + + # Check if there is any add-on features we will plug at the end of encoder embedding + if encoder_config["encoder_model"]["addOnFeature"] in ['NodeType', 'Position']: + input_dim += 1 + + self.link_prediction_decoder = setupMLP(embedding_dim=input_dim*2, mlpEncoding=encoder_config["decoder_model"]["encode_links"]) self.link_prediction_decoder.to(device) # Load all the snapshot true data - self.probabilities, self.graph_descriptions, self.thresholds, self.target_graphs = load_data(args.dataset, args.strategy, args.embedding, args.mlpEncoding, args.embedOld, args.trainingStyle, args.embeddingType) + self.probabilities, self.graph_descriptions, self.thresholds, self.target_graphs = load_data(encoder_config["dataset"], encoder_config["encoder_model"]["addOnFeature"], + encoder_config["decoder_model"]["encode_links"], encoder_config["encoder_model"]["nodeEmbeddingType"]) # Modify the graph ids to 1,2,3,... self.target_graphs, _ = modifyGraphIds(self.target_graphs, self.thresholds) @@ -106,7 +111,7 @@ def __init__(self): self.test_graphs = [self.target_graphs[i][-1] for i in range(self.val_end, self.num_snapshots)] # ======================= TRAIN MODEL ======================= - def train_multi_head(self, training_samples, validation_samples, epochs=250, batch_size=64, training_new_edges_count=0): + def train_multi_head(self, training_samples, validation_samples, training_new_edges_count=0): """ Train a MultiHeaded MLP Neural Network for use in edge predictions @@ -114,19 +119,17 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat model (MultiheadedMLP): The Multiheaded MLP to train now training_samples: The dictionary store the pos, neg edges of each snapshot, using for training validation_samples: The dictionary store the pos, neg edges of each snapshot, using for validation - epochs (int): The number of epochs to train for - batch_size (int): The batch size to use for the training data Returns: link_prediction_decoder (Multiheaded MLP): The trained MLP """ - lr = args.lr + lr = encoder_config["training"]["lr"] self.link_prediction_decoder.train() optimizer = torch.optim.Adam(list(self.encoder_model.parameters()) + list(self.link_prediction_decoder.parameters()), lr=lr) loss_fn = nn.BCELoss() graphlet_loss_fn = GraphletLoss() # Train - for epoch in range(epochs): + for epoch in range(encoder_config["training"]["epochs"]): train_loss = { 'o-o-bank': [], 'o-o-nobank': [], @@ -179,14 +182,14 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat X_train_curr, curr_y_train = shuffle(curr_X_train, curr_y_train, random_state=self.seed) temp_X_train = torch.tensor(X_train_curr, dtype=torch.float32).to(device) temp_y_train = torch.tensor(curr_y_train, dtype=torch.float32).to(device) - train_loader = DataLoader(TensorDataset(temp_X_train, temp_y_train), batch_size=batch_size, shuffle=True) + train_loader = DataLoader(TensorDataset(temp_X_train, temp_y_train), batch_size=encoder_config["training"]["batch_size"], shuffle=True) # Training graphs for predicting current snapshot training_graphs = self.training_graphs[:snapshot] for (x, y) in train_loader: optimizer.zero_grad() - node_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=training_graphs, encoder_model=self.encoder_model, device=device) + node_embeddings = compute_embedding(embeddingType=encoder_config["encoder_model"]["nodeEmbeddingType"], graphs=training_graphs, encoder_model=self.encoder_model, device=device) # Get current embeddings src_nodes = [int(n) for n in x[:, 0].tolist()] @@ -253,7 +256,7 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat train_labels.extend(y.detach().cpu().numpy()) # Constructing temp graph - curr_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=training_graphs, encoder_model=self.encoder_model, device=device) + curr_embeddings = compute_embedding(embeddingType=encoder_config["encoder_model"]["nodeEmbeddingType"], graphs=training_graphs, encoder_model=self.encoder_model, device=device) constructing_graph = get_node_features(constructing_graph.copy(), self.training_graphs[:snapshot], self.thresholds, self.graph_descriptions[snapshot], node_types["old_nodes"], node_types["new_nodes"]) sampled_edges = predict_edges(constructing_graph, edge_type=flag, node_types=node_types, edgebank=self.all_edgebanks[snapshot], link_prediction_decoder=self.link_prediction_decoder, old_node_embeddings=curr_embeddings, top_k=self.current_target_count[flag], graph_num=snapshot, device=device) @@ -275,7 +278,7 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat if (epoch + 1) % 100 == 0 or epoch == 0: epochMessage = f"Epoch {epoch+1:02d} | Edge Type: {flag} | Train Loss: {np.mean(train_loss[flag]):.4f} | Train AUCROC {np.mean(train_auc[flag]):.4f}" print(epochMessage) - with open(rf"{self.file_visualization_path}\{args.dataset}\{args.embeddingType}\multiheadMLP_performance.txt", "a") as f: + with open(rf'{self.file_visualization_path}\{encoder_config["dataset"]}\{encoder_config["encoder_model"]["nodeEmbeddingType"]}\multiheadMLP_performance.txt', "a") as f: f.write(epochMessage + "\n") @@ -298,21 +301,21 @@ def train_models(self): # Prepare training data training_sorted_samples, training_new_edges_count = generate_training_data_cached(training_graphs=self.training_graphs, - all_edgebanks=self.all_edgebanks, MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=self.seed, saved_data_file_path=self.saved_input) + all_edgebanks=self.all_edgebanks, MAX_SAMPLES=MAX_SAMPLES, dataset=encoder_config["dataset"], seed=self.seed, saved_data_file_path=self.saved_input) # Prepare validation data # We pass all_edgebanks of the training snapshots edgebanks validation_sorted_samples, training_new_edges_count = generate_validation_data_cached(training_graphs=self.validation_graphs, old_training_nodes=old_training_nodes, - all_edgebanks=self.all_edgebanks[self.train_end], MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=self.seed, type_data="validation", saved_data_file_path=self.saved_input) + all_edgebanks=self.all_edgebanks[self.train_end], MAX_SAMPLES=MAX_SAMPLES, dataset=encoder_config["dataset"], seed=self.seed, type_data="validation", saved_data_file_path=self.saved_input) # Prepare test data # We pass all_edgebanks of the training snapshots edgebanks test_sorted_samples, training_new_edges_count = generate_validation_data_cached(training_graphs=self.test_graphs, old_training_nodes=old_training_nodes, - all_edgebanks=self.all_edgebanks[self.val_end], MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=self.seed, type_data="test", saved_data_file_path=self.saved_input) + all_edgebanks=self.all_edgebanks[self.val_end], MAX_SAMPLES=MAX_SAMPLES, dataset=encoder_config["dataset"], seed=self.seed, type_data="test", saved_data_file_path=self.saved_input) print('Training') self.link_prediction_decoder = self.train_multi_head(training_samples=training_sorted_samples, validation_samples=validation_sorted_samples, - epochs=500, batch_size=64, training_new_edges_count=training_new_edges_count) + training_new_edges_count=training_new_edges_count) return self.link_prediction_decoder @@ -376,7 +379,7 @@ def build_accumulating_filtration_sequence_with_edgebank(self, current_target_sn constructing_graph = get_node_features(constructing_graph, prev_graphs, self.thresholds, current_target_graph_description, old_nodes, new_nodes) # Assign embeddings for all the training_nodes - curr_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=prev_graphs, encoder_model=self.encoder_model, device=device) + curr_embeddings = compute_embedding(embeddingType=encoder_config["encoder_model"]["nodeEmbeddingType"], graphs=prev_graphs, encoder_model=self.encoder_model, device=device) # Assign zero vector for new nodes for new_node in new_nodes: @@ -393,7 +396,7 @@ def build_accumulating_filtration_sequence_with_edgebank(self, current_target_sn constructing_graph.add_edges_from(sampled_edges) update_degrees(constructing_graph) - new_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=prev_graphs + [constructing_graph], encoder_model=self.encoder_model, device=device) + new_embeddings = compute_embedding(embeddingType=encoder_config["encoder_model"]["nodeEmbeddingType"], graphs=prev_graphs + [constructing_graph], encoder_model=self.encoder_model, device=device) curr_embeddings.update(new_embeddings) # Recompute old node embeddings edge_pool = edge_pool + sampled_edges @@ -437,7 +440,7 @@ def evaluate(self, pred_graph, true_graph, node_types): on_kl_divergence_results = self.evaluator.kl_divergence_graphs(pred_on_graph, true_on_graph, mode="total") - with open(rf"{self.file_visualization_path}\{args.dataset}\{args.embeddingType}\kl_results_on.txt", "a") as f: + with open(rf'{self.file_visualization_path}\{encoder_config["dataset"]}\{encoder_config["encoder_model"]["nodeEmbeddingType"]}\kl_results_on.txt', "a") as f: f.write(f"{self.current_target_snapshot + 1}, {on_kl_divergence_results:.6f}\n") # Evaluate the graph of n-n @@ -446,7 +449,7 @@ def evaluate(self, pred_graph, true_graph, node_types): nn_kl_divergence_results = self.evaluator.kl_divergence_graphs(pred_nn_graph, true_nn_graph, mode="total") - with open(rf"{self.file_visualization_path}\{args.dataset}\{args.embeddingType}\kl_results_nn.txt", "a") as f: + with open(rf'{self.file_visualization_path}\{encoder_config["dataset"]}\{encoder_config["encoder_model"]["nodeEmbeddingType"]}\kl_results_nn.txt', "a") as f: f.write(f"{self.current_target_snapshot + 1}, {nn_kl_divergence_results:.6f}\n") # Evaluate the graph of old nodes @@ -465,8 +468,8 @@ def evaluate(self, pred_graph, true_graph, node_types): pd.DataFrame([true_kernel]).to_csv(f"{self.kernel_dir}/kernel_true.csv", mode='a', header=False, index=False) def run(self): - print("INFO: Dataset: {}".format(args.dataset)) - encoder_model_path = os.path.join(self.saved_input, rf"saved_models/encoder_{args.embeddingType}_{self.seed}") + print("INFO: Dataset: {}".format(encoder_config["dataset"])) + encoder_model_path = os.path.join(self.saved_input, rf'saved_models/encoder_{encoder_config["encoder_model"]["nodeEmbeddingType"]}_{self.seed}') decoder_model_path = os.path.join(self.saved_input, rf"saved_data/decoder_MLP_{self.seed}") if os.path.exists(encoder_model_path) and os.path.exists(decoder_model_path): @@ -528,4 +531,4 @@ def run(self): runner.run() # To run the script -# python GraphGeneration/scripts/topoGED_end_to_end.py --embeddingType=LSTM --dataset=CollegeMsg --nfeat=64 \ No newline at end of file +# python GraphGeneration/scripts/topoGED_end_to_end.py --embeddingType=LSTM \ No newline at end of file diff --git a/GraphGeneration/utils/graph_construction_utils.py b/GraphGeneration/utils/graph_construction_utils.py index 8c6513a..de53d54 100644 --- a/GraphGeneration/utils/graph_construction_utils.py +++ b/GraphGeneration/utils/graph_construction_utils.py @@ -131,7 +131,6 @@ def update_degrees(graph: nx.DiGraph): if 'feat' not in graph.nodes[node]: graph.nodes[node]['feat'] = {'id': node} graph.nodes[node]['feat']['currDegree'] = 0 - graph.nodes[node]['feat']['maxDegree'] = assigned_degree else: graph.nodes[node]['feat']['currDegree'] = graph.degree(node) \ No newline at end of file From ca05d64dcfe256bd399ddaa9dff7afb1c8dfc02e Mon Sep 17 00:00:00 2001 From: Duy Kha Date: Thu, 31 Jul 2025 13:46:55 -0500 Subject: [PATCH 4/8] remove args --- GraphGeneration/models/temporal_gnn/script/config.py | 8 -------- GraphGeneration/scripts/topoGED_end_to_end.py | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/GraphGeneration/models/temporal_gnn/script/config.py b/GraphGeneration/models/temporal_gnn/script/config.py index 174e3a0..8bb01f6 100644 --- a/GraphGeneration/models/temporal_gnn/script/config.py +++ b/GraphGeneration/models/temporal_gnn/script/config.py @@ -50,14 +50,6 @@ parser.add_argument('--fixed_curvature', type=int, default=1, help='fixed (1) curvature or not (0)') parser.add_argument('--aggregation', type=str, default='deg', help='aggregation method: [deg, att]') -parser.add_argument("--strategy", type=str, required=False, default='MultiheadedMLP', choices=['MultiheadedMLP', 'SingleMLP'], help="The type of MLP NN to use") -parser.add_argument("--embedding", type=str, required=False, default='Position', choices=['Position', 'NodeType', 'Position+NodeType', 'None'], help="Allows appending positional encodings or an integer node type onto the end of the embeddings") -parser.add_argument("--mlpEncoding", type=str, required=False, default='Concat', choices=['Concat', 'Product', 'Addition', 'Subtraction'], help="How you want to input node embeddings to the MLP") # Product and addition lead to potential noise as we use directed graphs -parser.add_argument("--embedOld", type=str, required=False, default='True', choices=['True', 'False'], help="If you want to let the MLP predict edge type \'o-o-bank\', otherwise these edges are randomly added") -parser.add_argument("--oldDegree", type=str, required=False, default='False' ,choices=['True', 'False'], help="If you want reappearing nodes to reuse their most recent degree") -parser.add_argument("--trainingStyle", type=str, required=False, default='TrueGraphs', choices=['TrueGraphs', 'PredGraphs', 'MixedGraphs'], help="When training the MLP, decides if you use real graphs, predicted graphs (with first real as starter), or real then pred for MLP training") -parser.add_argument("--embeddingType", type=str, required=False, default='Node2Vec', choices=['Linear', 'Node2Vec', 'LSTM', 'GCLSTM', 'HTGN'], help="How nodes should be embedded. Either with Node2Vec or with a Linear mutliplication of adjacency matrix by node feature matrix") - # TopoGED mode parser.add_argument('--use_predict_probs', action='store_true', help='Use prediction probabilities to predict the next snapshot') parser.add_argument('--use_predict_graph_prediction', action='store_true', help='Use prediction graph description to predict the next snapshot') diff --git a/GraphGeneration/scripts/topoGED_end_to_end.py b/GraphGeneration/scripts/topoGED_end_to_end.py index 7e0bea7..6967473 100644 --- a/GraphGeneration/scripts/topoGED_end_to_end.py +++ b/GraphGeneration/scripts/topoGED_end_to_end.py @@ -531,4 +531,4 @@ def run(self): runner.run() # To run the script -# python GraphGeneration/scripts/topoGED_end_to_end.py --embeddingType=LSTM \ No newline at end of file +# python GraphGeneration/scripts/topoGED_end_to_end.py \ No newline at end of file From fdc405279cb9456a88c9f7f285ba220863e9df8b Mon Sep 17 00:00:00 2001 From: Duy Kha Date: Fri, 1 Aug 2025 21:42:51 -0500 Subject: [PATCH 5/8] comments --- GraphGeneration/encoder.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GraphGeneration/encoder.yaml b/GraphGeneration/encoder.yaml index 9174ae8..53ab881 100644 --- a/GraphGeneration/encoder.yaml +++ b/GraphGeneration/encoder.yaml @@ -18,7 +18,7 @@ decoder_model: encode_links: "Concat" training: - batch_size: 64 + batch_size: 64 # play around lr: 0.001 epochs: 500 From 6bf6325a6756d394b0dbe61d4a2e392d6bb86a84 Mon Sep 17 00:00:00 2001 From: Duy Kha Date: Mon, 4 Aug 2025 00:16:59 -0500 Subject: [PATCH 6/8] draft --- GraphGeneration/scripts/load_data.py | 12 +- GraphGeneration/scripts/topoGED_end_to_end.py | 184 ++++++++++++++++-- 2 files changed, 176 insertions(+), 20 deletions(-) diff --git a/GraphGeneration/scripts/load_data.py b/GraphGeneration/scripts/load_data.py index 5bffaea..1a86e55 100644 --- a/GraphGeneration/scripts/load_data.py +++ b/GraphGeneration/scripts/load_data.py @@ -282,10 +282,14 @@ def generate_validation_data(training_graphs, old_training_nodes, all_edgebanks, print(f"[FATAL] Unexpected failure at outer loop for edge ({u}, {v}): {type(e).__name__} - {e}") # Generate an equal amount of negative labels for each type of edge - negative_edges_oo = generate_negative_edges(graph, new_edges_count['o-o-bank'], edge_type='o-o-bank', edgebank=all_edgebanks, old_nodes=old_training_nodes) - negative_edges_oon = generate_negative_edges(graph, new_edges_count['o-o-nobank'], edge_type='o-o-nobank', edgebank=all_edgebanks, old_nodes=old_training_nodes) - negative_edges_on = generate_negative_edges(graph, new_edges_count['o-n'], edge_type='o-n', edgebank=all_edgebanks, old_nodes=old_training_nodes) - negative_edges_nn = generate_negative_edges(graph, new_edges_count['n-n'], edge_type='n-n', edgebank=all_edgebanks, old_nodes=old_training_nodes) + negative_edges_oo = generate_negative_edges(graph, new_edges_count['o-o-bank'], edge_type='o-o-bank', + edgebank=all_edgebanks, old_nodes=old_training_nodes) + negative_edges_oon = generate_negative_edges(graph, new_edges_count['o-o-nobank'], edge_type='o-o-nobank', + edgebank=all_edgebanks, old_nodes=old_training_nodes) + negative_edges_on = generate_negative_edges(graph, new_edges_count['o-n'], edge_type='o-n', + edgebank=all_edgebanks, old_nodes=old_training_nodes) + negative_edges_nn = generate_negative_edges(graph, new_edges_count['n-n'], edge_type='n-n', + edgebank=all_edgebanks, old_nodes=old_training_nodes) tmp_samples_oo = [torch.tensor([u, v]) for u, v in negative_edges_oo] tmp_samples_oon = [torch.tensor([u, v]) for u, v in negative_edges_oon] diff --git a/GraphGeneration/scripts/topoGED_end_to_end.py b/GraphGeneration/scripts/topoGED_end_to_end.py index 8e7941d..afdc99a 100644 --- a/GraphGeneration/scripts/topoGED_end_to_end.py +++ b/GraphGeneration/scripts/topoGED_end_to_end.py @@ -24,7 +24,7 @@ from GraphGeneration.models.model import setupMLP, load_encoder_model # Import all node embedding methods -from compute_embedding import compute_embedding, node2vec_batch_words +from compute_embedding import compute_embedding from process_data import modifyGraphIds, build_edgebanks_from_start from torch.utils.data import DataLoader @@ -51,6 +51,7 @@ class Runner(object): def __init__(self): self.seed = global_seed + self.best_validation_model_auc = 0 # Set up Evaluator self.evaluator = Evaluator() @@ -75,6 +76,7 @@ def __init__(self): # Check if there is any add-on features we will plug at the end of encoder embedding if args.embedding in ['NodeType', 'Position']: input_dim += 1 + args.nfeat += 1 self.link_prediction_decoder = setupMLP(embedding_dim=input_dim*2, embedding=args.embedding, mlpEncoding=args.mlpEncoding, embedOld=args.embedOld) self.link_prediction_decoder.to(device) @@ -96,14 +98,155 @@ def __init__(self): # Convert number of snapshots to integer self.num_snapshots = len(self.probabilities) self.train_end = int(0.8 * self.num_snapshots) - val_end = int(0.9 * self.num_snapshots) + self.val_end = int(0.9 * self.num_snapshots) # Assign snapshots self.training_graphs = [self.target_graphs[i][-1] for i in range(self.train_end)] - self.validation_graphs = [self.target_graphs[i][-1] for i in range(self.train_end, val_end)] - self.test_graphs = [self.target_graphs[i][-1] for i in range(val_end, self.num_snapshots)] + self.validation_graphs = [self.target_graphs[i][-1] for i in range(self.train_end, self.val_end)] + self.test_graphs = [self.target_graphs[i][-1] for i in range(self.val_end, self.num_snapshots)] # ======================= TRAIN MODEL ======================= + def run_validation(self, validation_samples, batch_size, epoch): + train_auc = { + 'o-o-bank': [], + 'o-o-nobank': [], + 'o-n': [], + 'n-n': [], + } + # For computing AUC Scores + train_preds = [] + train_labels = [] + + for i in range(1): + snapshot = i + len(self.training_graphs) + 1 + self.encoder_model.eval() + self.link_prediction_decoder.eval() + with torch.no_grad(): + print("INFO: Validation on snapshot", snapshot) + + node_types = { + "old_nodes": set().union(*(graph.nodes() for graph in self.training_graphs)), + "new_nodes": set() + } + + # Prepare current target graph count + self.current_target_count_old_nodes = self.probabilities[snapshot][0] + self.current_target_count_new_nodes = self.probabilities[snapshot][1] + self.current_target_count = { + edge_type: self.probabilities[snapshot][j + 2] + for j, edge_type in enumerate(self.all_edge_types) + } + + constructing_graph = nx.DiGraph() # Graph we try to predict + + # Adding old nodes to constructing_graph + constructing_graph.add_nodes_from(node_types['old_nodes']) + + for flag in self.all_edge_types: + curr_X_train = validation_samples[flag]['X'][i] + curr_y_train = validation_samples[flag]['y'][i] + + if len(curr_X_train) == 0 or len(curr_y_train) == 0: + print(f'No samples for edge type: {flag}') + continue + + curr_X_train = [x.cpu().detach().numpy() if torch.is_tensor(x) else x for x in curr_X_train] + curr_X_train = np.array(curr_X_train) + curr_y_train = np.array(curr_y_train) + + X_train_curr, curr_y_train = shuffle(curr_X_train, curr_y_train, random_state=self.seed) + temp_X_train = torch.tensor(X_train_curr, dtype=torch.float32).to(device) + temp_y_train = torch.tensor(curr_y_train, dtype=torch.float32).to(device) + train_loader = DataLoader(TensorDataset(temp_X_train, temp_y_train), batch_size=batch_size, shuffle=True) + + # Training graphs for predicting current snapshot + validation_graphs = self.training_graphs + + for (x, y) in train_loader: + node_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=validation_graphs, encoder_model=self.encoder_model, device=device) + + # Get current embeddings + src_nodes = [int(n) for n in x[:, 0].tolist()] + dst_nodes = [int(n) for n in x[:, 1].tolist()] + + # Add new nodes to the node_types + for n in src_nodes: + if n not in node_embeddings and flag in ['o-n', 'n-n']: + node_types["new_nodes"].add(n) + constructing_graph.add_node(n) + node_embeddings[n] = torch.zeros(args.nfeat, device=device) + + for n in dst_nodes: + if n not in node_embeddings and flag in ['o-n', 'n-n']: + node_types["new_nodes"].add(n) + constructing_graph.add_node(n) + node_embeddings[n] = torch.zeros(args.nfeat, device=device) + + src_embed = torch.stack([ + node_embeddings[n] for n in src_nodes + ]) + + dst_embed = torch.stack([ + node_embeddings[n] for n in dst_nodes + ]) + + if src_embed.dim() == 1: + src_embed = src_embed.unsqueeze(1) + if dst_embed.dim() == 1: + dst_embed = dst_embed.unsqueeze(1) + + preds = self.link_prediction_decoder(src_embed=src_embed, dst_embed=dst_embed, edge_type=flag) + + if preds.dim() == 0: + preds = preds.unsqueeze(0) + if y.dim() == 0: # scalar value like torch.tensor(0.5) + y = y.unsqueeze(0) # make it [1] + elif y.dim() == 2 and y.size(1) == 1: # shape [batch_size, 1] + y = y.view(-1) + + # Add to our labels for evaluation + train_preds.extend(preds.detach().cpu().numpy()) + train_labels.extend(y.detach().cpu().numpy()) + + # Assign embeddings for all the training_nodes + curr_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=validation_graphs, encoder_model=self.encoder_model, device=device) + constructing_graph = get_node_features(constructing_graph.copy(), self.training_graphs, self.thresholds, self.graph_descriptions[snapshot], node_types["old_nodes"], node_types["new_nodes"]) + sampled_edges = predict_edges(constructing_graph, edge_type=flag, node_types=node_types, edgebank=self.all_edgebanks[snapshot], link_prediction_decoder=self.link_prediction_decoder, + old_node_embeddings=curr_embeddings, top_k=self.current_target_count[flag], graph_num=snapshot, device=device) + constructing_graph.add_edges_from(list(sampled_edges)) + update_degrees(constructing_graph) + + # Update the training_graphs to involve with the constructing graph + if flag == 'o-o-nobank': + validation_graphs.append(constructing_graph) + else: + validation_graphs[-1] = constructing_graph + + if len(np.unique(train_labels)) < 2: + train_auc.append(0) + else: + train_auc[flag].append(roc_auc_score(train_labels, train_preds)) # Calculate scores + + # Record the Training Loss, AUC + current_model_auc = 0 #we take average of all edge types + + for flag in self.all_edge_types: + epochMessage = f"Epoch {epoch+1:02d} | Edge Type: {flag} | Validation AUCROC {np.mean(train_auc[flag]):.4f}" + current_model_auc += np.mean(train_auc[flag]) + print(epochMessage) + with open(rf"{self.file_visualization_path}\{args.dataset}\{args.embeddingType}\multiheadMLP_performance.txt", "a") as f: + f.write(epochMessage + "\n") + + # We check and cache if it has the best auc + if current_model_auc/4 >= self.best_validation_model_auc: + self.best_validation_model_auc = current_model_auc + + print("INFO: Saving the model...") + torch.save(self.link_prediction_decoder.state_dict(), self.model_path) + torch.save(self.encoder_model.state_dict(), self.model_path) + print("INFO: The model is saved. Done.") + + def train_multi_head(self, training_samples, validation_samples, epochs=250, batch_size=64, training_new_edges_count=0): """ Train a MultiHeaded MLP Neural Network for use in edge predictions @@ -141,11 +284,11 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat train_preds = [] train_labels = [] - for snapshot in range(2, len(self.training_graphs)): + for snapshot in range(2, 3): print("INFO: Training on snapshot", snapshot) node_types = { - "old_nodes": set().union(*(graph.nodes() for graph in self.training_graphs[:snapshot])), + "old_nodes": set().union(*(graph.nodes() for graph in self.training_graphs[max(0, snapshot - 5):snapshot])), "new_nodes": set() } @@ -180,7 +323,7 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat train_loader = DataLoader(TensorDataset(temp_X_train, temp_y_train), batch_size=batch_size, shuffle=True) # Training graphs for predicting current snapshot - training_graphs = self.training_graphs[:snapshot] + training_graphs = self.training_graphs[max(0, snapshot - 5):snapshot] for (x, y) in train_loader: optimizer.zero_grad() @@ -235,7 +378,6 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat graphlet_loss = graphlet_loss_fn(to_tensor(pred_kernel, device=device).unsqueeze(0), to_tensor(true_kernel, device=device).unsqueeze(0)) loss = 0.5*loss_fn(preds, y) + 0.5*graphlet_loss - # loss = loss_fn(preds, y) loss.backward() optimizer.step() train_loss[flag].append(loss.item()) @@ -253,7 +395,7 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat update_degrees(constructing_graph) # Update the training_graphs to involve with the constructing graph - if len(training_graphs) == snapshot: + if flag == 'o-o-nobank': training_graphs.append(constructing_graph) else: training_graphs[-1] = constructing_graph @@ -263,6 +405,10 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat else: train_auc[flag].append(roc_auc_score(train_labels, train_preds)) # Calculate scores + # Validation + self.run_validation(validation_samples=validation_samples, batch_size=batch_size, epoch=epoch) + + # Record the Training Loss, AUC for flag in self.all_edge_types: if (epoch + 1) % 100 == 0 or epoch == 0: epochMessage = f"Epoch {epoch+1:02d} | Edge Type: {flag} | Train Loss: {np.mean(train_loss[flag]):.4f} | Train AUCROC {np.mean(train_auc[flag]):.4f}" @@ -271,7 +417,7 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat f.write(epochMessage + "\n") - return self.link_prediction_decoder + return self.link_prediction_decoder, self.encoder_model def train_models(self): """ @@ -290,23 +436,29 @@ def train_models(self): # Prepare training data training_sorted_samples, training_new_edges_count = generate_training_data_cached(training_graphs=self.training_graphs, - all_edgebanks=self.all_edgebanks, MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=global_seed, saved_data_file_path=self.saved_input) + all_edgebanks=self.all_edgebanks, MAX_SAMPLES=MAX_SAMPLES, + dataset=args.dataset, seed=global_seed, + saved_data_file_path=self.saved_input) # Prepare validation data # We pass all_edgebanks of the training snapshots edgebanks validation_sorted_samples, training_new_edges_count = generate_validation_data_cached(training_graphs=self.validation_graphs, old_training_nodes=old_training_nodes, - all_edgebanks=self.all_edgebanks[self.train_end], MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=global_seed, type_data="validation", saved_data_file_path=self.saved_input) + all_edgebanks=self.all_edgebanks[self.train_end], MAX_SAMPLES=MAX_SAMPLES, + dataset=args.dataset, seed=global_seed, + type_data="validation", saved_data_file_path=self.saved_input) # Prepare test data # We pass all_edgebanks of the training snapshots edgebanks test_sorted_samples, training_new_edges_count = generate_validation_data_cached(training_graphs=self.test_graphs, old_training_nodes=old_training_nodes, - all_edgebanks=self.all_edgebanks[self.train_end], MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=global_seed, type_data="test", saved_data_file_path=self.saved_input) + all_edgebanks=self.all_edgebanks[self.val_end], MAX_SAMPLES=MAX_SAMPLES, + dataset=args.dataset, seed=global_seed, + type_data="test", saved_data_file_path=self.saved_input) print('Training') self.link_prediction_decoder = self.train_multi_head(training_samples=training_sorted_samples, validation_samples=validation_sorted_samples, epochs=500, batch_size=64, training_new_edges_count=training_new_edges_count) - return self.link_prediction_decoder + return self.link_prediction_decoder, self.encoder_model # ======================= BUILD GRAPH ======================= def build_accumulating_filtration_sequence_with_edgebank(self, current_target_snapshot): @@ -330,7 +482,7 @@ def build_accumulating_filtration_sequence_with_edgebank(self, current_target_sn # Get the edgebank up to the current target snapshot edgebank = self.all_edgebanks[current_target_snapshot] current_target_graph_description = self.graph_descriptions[current_target_snapshot] - prev_graphs = [graph[-1] for graph in self.target_graphs[:current_target_snapshot]] + prev_graphs = [graph[-1] for graph in self.target_graphs[max(0, current_target_snapshot - 5):current_target_snapshot]] V_total = int(current_target_graph_description[-1][0]) E_total = int(current_target_graph_description[-1][1]) @@ -472,7 +624,7 @@ def run(self): else: # Train the Decoder and Encoder model print('Training the Link Prediction Decoder and Encoder') - self.link_prediction_decoder = self.train_models() + self.link_prediction_decoder, self.encoder_model = self.train_models() print('Finished training the Link Prediction Decoder and Encoder; Start Graph Construction') # saving the trained model From a425fc932a89ccaa7fc3bce7dd403e7c4a959ee4 Mon Sep 17 00:00:00 2001 From: Duy Kha Date: Tue, 5 Aug 2025 15:58:01 -0500 Subject: [PATCH 7/8] rewrite loading model message + add gpu memory alloc --- GraphGeneration/scripts/topoGED_end_to_end.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/GraphGeneration/scripts/topoGED_end_to_end.py b/GraphGeneration/scripts/topoGED_end_to_end.py index 0cbacf9..5365140 100644 --- a/GraphGeneration/scripts/topoGED_end_to_end.py +++ b/GraphGeneration/scripts/topoGED_end_to_end.py @@ -255,7 +255,8 @@ def run_validation(self, validation_samples, batch_size, epoch): current_model_auc = 0 #we take average of all edge types for flag in self.all_edge_types: - epochMessage = f"Epoch {epoch+1:02d} | Edge Type: {flag} | Validation AUCROC {np.mean(train_auc[flag]):.4f}" + gpu_mem_alloc = torch.cuda.max_memory_allocated() / 1000000 if torch.cuda.is_available() else 0 + epochMessage = f"Epoch {epoch+1:02d} | Edge Type: {flag} | Validation AUCROC {np.mean(train_auc[flag]):.4f} | GPU: {gpu_mem_alloc:.1f}MiB" current_model_auc += np.mean(train_auc[flag]) print(epochMessage) with open(rf'{self.file_visualization_path}\{encoder_config["dataset"]}\{encoder_config["encoder_model"]["nodeEmbeddingType"]}\multiheadMLP_performance.txt', "a") as f: @@ -306,7 +307,7 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat train_preds = [] train_labels = [] - for snapshot in range(2, 15): + for snapshot in range(2, 16): print("INFO: Training on snapshot", snapshot) # Prepare current target graph count @@ -437,9 +438,10 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat self.run_validation(validation_samples=validation_samples, batch_size=encoder_config["training"]["batch_size"], epoch=epoch) # Record the Training Loss, AUC + gpu_mem_alloc = torch.cuda.max_memory_allocated() / 1000000 if torch.cuda.is_available() else 0 for flag in self.all_edge_types: if (epoch + 1) % 20 == 0 or epoch == 0: - epochMessage = f"Epoch {epoch+1:02d} | Edge Type: {flag} | Train Loss: {np.mean(train_loss[flag]):.4f} | Train AUCROC {np.mean(train_auc[flag]):.4f}" + epochMessage = f"Epoch {epoch+1:02d} | Edge Type: {flag} | Train Loss: {np.mean(train_loss[flag]):.4f} | Train AUCROC {np.mean(train_auc[flag]):.4f} | GPU: {gpu_mem_alloc:.1f}MiB" print(epochMessage) with open(rf'{self.file_visualization_path}\{encoder_config["dataset"]}\{encoder_config["encoder_model"]["nodeEmbeddingType"]}\multiheadMLP_performance.txt', "a") as f: f.write(epochMessage + "\n") @@ -644,8 +646,8 @@ def run(self): self.link_prediction_decoder.eval() self.encoder_model.eval() - print(f"✅ Link Prediction Decoder loaded from: {decoder_model_path}") - print(f"✅ Ecoder loaded from: {encoder_model_path}") + print(f"Link Prediction Decoder loaded from: {decoder_model_path}") + print(f"Encoder loaded from: {encoder_model_path}") else: # Train the Decoder and Encoder model print('Training the Link Prediction Decoder and Encoder') From 699dca7e1a936cef14ee5237b4dd779614316efa Mon Sep 17 00:00:00 2001 From: Duy Kha Date: Tue, 5 Aug 2025 15:59:08 -0500 Subject: [PATCH 8/8] remove saving model after train_models --- GraphGeneration/scripts/topoGED_end_to_end.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/GraphGeneration/scripts/topoGED_end_to_end.py b/GraphGeneration/scripts/topoGED_end_to_end.py index 5365140..d3bbedb 100644 --- a/GraphGeneration/scripts/topoGED_end_to_end.py +++ b/GraphGeneration/scripts/topoGED_end_to_end.py @@ -653,12 +653,6 @@ def run(self): print('Training the Link Prediction Decoder and Encoder') self.link_prediction_decoder, self.encoder_model = self.train_models() print('Finished training the Link Prediction Decoder and Encoder; Start Graph Construction') - - # saving the trained model - print("INFO: Saving the model...") - torch.save(self.link_prediction_decoder.state_dict(), self.model_path) - torch.save(self.encoder_model.state_dict(), self.model_path) - print("INFO: The model is saved. Done.") # Old graphs that we know up to now self.old_graphs = [self.target_graphs[0], self.target_graphs[1]]