FDataLab · ComTDK · Jul 30, 2025 · Jul 30, 2025 · Jul 31, 2025 · Jul 31, 2025
diff --git a/GraphGeneration/encoder.yaml b/GraphGeneration/encoder.yaml
@@ -0,0 +1,26 @@
+encoder_model:
+  name: topoGED
+  node2vec_setup:
+    node2vec_dimensions: 64 # Embedding size per node; extra features are appended afterwards since Node2Vec doesn't embed node features
+    node2vec_walk_length: 50 # Number of nodes visited per walk (Higher is more global, smaller is local)
+    node2vec_num_walks: 10 # Number of walks to start per node (Higher is more detailed and stable)
+    node2vec_p: 1.0 # Return parameter, the likelihood of revisiting a node (Higher is less backtracking)
+    node2vec_q: 1.0 # In-out parameter biasing the walk direction (Higher is more BFS-like/local; lower is more DFS-like/exploratory)
+    node2vec_window: 10 # The context size (Higher is broader learning)
+    node2vec_min_count: 1 # Minimum number of occurrences for a node to be considered (Higher will ignore more rare nodes)
+    node2vec_batch_words: 4 # The batch size for when Word2Vec is used (Higher will train faster, but uses more memory)
+    node2vec_workers: 1 # Number of workers (threads)
+  hidden_dim: 64
+  nodeEmbeddingType: "LSTM"
+  addOnFeature: "Position"
+
+decoder_model:
+  encode_links: "Concat"
+
+training:
+  batch_size: 64 # Tunable; experiment with other values
+  lr: 0.001
+  epochs: 500
+
+dataset: CollegeMsg
+seed: 1024
diff --git a/GraphGeneration/models/model.py b/GraphGeneration/models/model.py
@@ -1,14 +1,9 @@
 # Models in use
-import os
-import torch
 from GraphGeneration.models.MultiHeadedEdgePredictor import MultiHeadedEdgePredictor
-from GraphGeneration.models.EdgePredictor import EdgePredictorMLP
 from GraphGeneration.models.temporal_gnn.script.utils.util import logger
-from GraphGeneration.models.temporal_gnn.script.models.HTGN import HTGN
-from GraphGeneration.models.GCLSTM import GCLSTM
 from GraphGeneration.models.SimpleNodeLSTM import SimpleNodeLSTM
 
-def setupMLP(embedding_dim, embedding, mlpEncoding, embedOld):   
+def setupMLP(embedding_dim, mlpEncoding):   
     """
     Set up the MLP based on the arguments provided in the command line starter
 
@@ -21,28 +16,15 @@ def setupMLP(embedding_dim, embedding, mlpEncoding, embedOld):
     input_dim = embedding_dim  # Starting input dimension (two 32-dim node embeddings)
 
     # Set up the MLPs according to arguments
-    if embedOld == 'True':
-        flags = ['o-o-bank', 'o-o-nobank', 'o-n', 'n-n']
-    else:
-        flags = ['o-o-nobank', 'o-n', 'n-n']
+    flags = ['o-o-bank', 'o-o-nobank', 'o-n', 'n-n']
     mlp = MultiHeadedEdgePredictor(in_channels=input_dim, hidden_channels=32, edge_types=flags, input_type=mlpEncoding)
 
     return mlp
 
-def load_encoder_model(args, device, node2vec_dimensions, hidden_dim=64, HTGN_nodelist=[]):       
-    if args.embeddingType == 'LSTM':
+def load_encoder_model(encoder_config, device, node2vec_dimensions, hidden_dim=64, HTGN_nodelist=[]):       
+    if encoder_config["encoder_model"]["nodeEmbeddingType"] == 'LSTM':
         model = SimpleNodeLSTM(input_dim=node2vec_dimensions, hidden_dim=hidden_dim).to(device)
-    elif args.embeddingType == 'GCLSTM':
-        model = GCLSTM(in_channels=node2vec_dimensions, hidden_channels=64).to(device)
-        model.device = device
-    elif args.embeddingType == 'HTGN':
-        args.num_nodes = len(HTGN_nodelist)
-        args.nfeat = node2vec_dimensions
-        args.nhid = 64
-        args.nout = 64
-        model = HTGN(args).to(device)
-        model.device = device
     else:
         raise Exception('pls define the model')
-    logger.info('using model {} '.format(args.embeddingType))
+    logger.info('using model {} '.format(encoder_config["encoder_model"]["nodeEmbeddingType"]))
     return model, hidden_dim
diff --git a/GraphGeneration/models/temporal_gnn/script/config.py b/GraphGeneration/models/temporal_gnn/script/config.py
@@ -50,14 +50,6 @@
 parser.add_argument('--fixed_curvature', type=int, default=1, help='fixed (1) curvature or not (0)')
 parser.add_argument('--aggregation', type=str, default='deg', help='aggregation method: [deg, att]')
 
-parser.add_argument("--strategy", type=str, required=False, default='MultiheadedMLP', choices=['MultiheadedMLP', 'SingleMLP'], help="The type of MLP NN to use")
-parser.add_argument("--embedding", type=str, required=False, default='Position', choices=['Position', 'NodeType', 'Position+NodeType', 'None'], help="Allows appending positional encodings or an integer node type onto the end of the embeddings")
-parser.add_argument("--mlpEncoding", type=str, required=False, default='Concat', choices=['Concat', 'Product', 'Addition', 'Subtraction'], help="How you want to input node embeddings to the MLP")  # Product and addition lead to potential noise as we use directed graphs
-parser.add_argument("--embedOld", type=str, required=False, default='True', choices=['True', 'False'], help="If you want to let the MLP predict edge type \'o-o-bank\', otherwise these edges are randomly added")
-parser.add_argument("--oldDegree", type=str, required=False, default='False' ,choices=['True', 'False'], help="If you want reappearing nodes to reuse their most recent degree")
-parser.add_argument("--trainingStyle", type=str, required=False, default='TrueGraphs', choices=['TrueGraphs', 'PredGraphs', 'MixedGraphs'], help="When training the MLP, decides if you use real graphs, predicted graphs (with first real as starter), or real then pred for MLP training")
-parser.add_argument("--embeddingType", type=str, required=False, default='Node2Vec', choices=['Linear', 'Node2Vec', 'LSTM', 'GCLSTM', 'HTGN'], help="How nodes should be embedded. Either with Node2Vec or with a Linear mutliplication of adjacency matrix by node feature matrix")
-
 # TopoGED mode
 parser.add_argument('--use_predict_probs', action='store_true', help='Use prediction probabilities to predict the next snapshot')
 parser.add_argument('--use_predict_graph_prediction', action='store_true', help='Use prediction graph description to predict the next snapshot')

diff --git a/GraphGeneration/scripts/compute_embedding.py b/GraphGeneration/scripts/compute_embedding.py
@@ -5,17 +5,11 @@
 import torch
 from node2vec import Node2Vec
 from GraphGeneration.models.temporal_gnn.script.config import args
+import yaml
 
-# Node2Vec Parameters
-node2vec_dimensions = args.nfeat  # We add features onto the end since Node2Vec doesn't embed features 
-node2vec_walk_length = 50  # Number of nodes visited per walk (Higher is more global, smaller is local)
-node2vec_num_walks = 10  # Number of walks to start per node (Higher is more detailed and stable)
-node2vec_p = 1.0  # Return parameter, the likelihood of revisiting a node (Higher is less backtracking)
-node2vec_q = 1.0  # The walk bias for determining direction (Higher is more DFS-like; lower is BFS-like)
-node2vec_window = 10  # The context size (Higher is broader learning)
-node2vec_min_count = 1  # Minimum number of occurrences for a node to be considered (Higher will ignore more rare nodes)
-node2vec_batch_words = 4  # The batch size for when Word2Vec is used (Higher will train faster; but with more memory)
-node2vec_workers = 1  # Number of workers (threads)
+# Load YAML config
+with open("GraphGeneration/encoder.yaml", "r") as file:
+    encoder_config = yaml.safe_load(file)
 
 def compute_linear_gnn_embeddings(G: nx.DiGraph, device):
     """
@@ -62,19 +56,20 @@ def compute_node2vec_embeddings(G: nx.DiGraph, device):
     """
     node2vec = Node2Vec(
         G,
-        dimensions=node2vec_dimensions,
-        walk_length=node2vec_walk_length,
-        num_walks=node2vec_num_walks,
-        workers=node2vec_workers,
-        p=node2vec_p,
-        q=node2vec_q,
+        dimensions=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"],
+        walk_length=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_walk_length"],
+        num_walks=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_num_walks"],
+        workers=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_workers"],
+        p=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_p"],
+        q=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_q"],
         quiet=True
     )
 
     model = node2vec.fit(
-        window=node2vec_window, 
-        min_count=node2vec_min_count, 
-        batch_words=node2vec_batch_words
+        window=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_window"], 
+        min_count=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_min_count"], 
+        batch_words=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_batch_words"],
+        workers=1 
     )  # Perform Node2Vec
 
     # Used to generate an embedding for isolated nodes
@@ -111,7 +106,8 @@ def compute_node_embeddings_LSTM(graph_snapshots, lstm_model, device):
     # Collect per-timestep node embeddings
     node_history = defaultdict(list)
     old_nodes = set()
-    null_embed = torch.tensor([0]*(node2vec_dimensions), dtype=torch.float32).to(device)
+    null_embed = torch.tensor([0]*(encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"]),
+                              dtype=torch.float32).to(device)
     for G in graph_snapshots:
         snapshot_embeddings = compute_node2vec_embeddings(G, device)
         for node, emb in snapshot_embeddings.items():
@@ -151,7 +147,7 @@ def get_GCN_data(graph_snapshots):
     x_list = []
     edge_index_list = []
 
-    F = node2vec_dimensions # number of features per node (change this if you want more features)
+    F = encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"] # number of features per node (change this if you want more features)
 
     for G in graph_snapshots:
         node2vec_embeddings = compute_node2vec_embeddings(G)

diff --git a/GraphGeneration/scripts/load_data.py b/GraphGeneration/scripts/load_data.py
@@ -10,7 +10,7 @@
 from utils.loader import Loader
 
 
-def load_data(dataset, strategy, embedding, mlpEncoding, embedOld, trainingStyle, embeddingType):
+def load_data(dataset, embedding, mlpEncoding, embeddingType):
     my_loader = Loader()
     output_dir = os.path.abspath(f'data/input/cached/{dataset}')
 
@@ -19,14 +19,14 @@ def load_data(dataset, strategy, embedding, mlpEncoding, embedOld, trainingStyle
     cached_data_dataset_folder = os.path.join(output_dir, 'saved_data/')
 
     # Construct output evaluation csv
-    structure_pred_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/structure_pred.csv'
-    structure_true_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/structure_true.csv'
-    structure_diff_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/structure_diff.csv'
-    kernel_pred_file_path = f'GraphGeneration/output/results/kernel/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/kernel_pred.csv'
-    kernel_true_file_path = f'GraphGeneration/output/results/kernel/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/kernel_true.csv'
-    edge_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/edge_analysis.csv'
-    topER_file_path = f'GraphGeneration/output/results/topER/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/toper_diff.csv'
-    animation_path = f'GraphGeneration/output/results/animations/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/pred_vs_true.mp4'
+    structure_pred_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/structure_pred.csv'
+    structure_true_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/structure_true.csv'
+    structure_diff_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/structure_diff.csv'
+    kernel_pred_file_path = f'GraphGeneration/output/results/kernel/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/kernel_pred.csv'
+    kernel_true_file_path = f'GraphGeneration/output/results/kernel/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/kernel_true.csv'
+    edge_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/edge_analysis.csv'
+    topER_file_path = f'GraphGeneration/output/results/topER/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/toper_diff.csv'
+    animation_path = f'GraphGeneration/output/results/animations/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/pred_vs_true.mp4'
 
     # Create file paths if needed
     for path in [structure_pred_file_path, structure_true_file_path, structure_diff_file_path, kernel_pred_file_path, 
@@ -282,10 +282,14 @@ def generate_validation_data(training_graphs, old_training_nodes, all_edgebanks,
                 print(f"[FATAL] Unexpected failure at outer loop for edge ({u}, {v}): {type(e).__name__} - {e}")
 
         # Generate an equal amount of negative labels for each type of edge
-        negative_edges_oo = generate_negative_edges(graph, new_edges_count['o-o-bank'], edge_type='o-o-bank', edgebank=all_edgebanks, old_nodes=old_training_nodes)
-        negative_edges_oon = generate_negative_edges(graph, new_edges_count['o-o-nobank'], edge_type='o-o-nobank', edgebank=all_edgebanks, old_nodes=old_training_nodes)
-        negative_edges_on = generate_negative_edges(graph, new_edges_count['o-n'], edge_type='o-n', edgebank=all_edgebanks, old_nodes=old_training_nodes)
-        negative_edges_nn = generate_negative_edges(graph, new_edges_count['n-n'], edge_type='n-n', edgebank=all_edgebanks, old_nodes=old_training_nodes)
+        negative_edges_oo = generate_negative_edges(graph, new_edges_count['o-o-bank'], edge_type='o-o-bank', 
+                                                    edgebank=all_edgebanks, old_nodes=old_training_nodes)
+        negative_edges_oon = generate_negative_edges(graph, new_edges_count['o-o-nobank'], edge_type='o-o-nobank', 
+                                                     edgebank=all_edgebanks, old_nodes=old_training_nodes)
+        negative_edges_on = generate_negative_edges(graph, new_edges_count['o-n'], edge_type='o-n', 
+                                                    edgebank=all_edgebanks, old_nodes=old_training_nodes)
+        negative_edges_nn = generate_negative_edges(graph, new_edges_count['n-n'], edge_type='n-n', 
+                                                    edgebank=all_edgebanks, old_nodes=old_training_nodes)
 
         tmp_samples_oo = [torch.tensor([u, v]) for u, v in negative_edges_oo]
         tmp_samples_oon = [torch.tensor([u, v]) for u, v in negative_edges_oon]
@@ -301,7 +305,9 @@ def generate_validation_data(training_graphs, old_training_nodes, all_edgebanks,
         sorted_samples['o-n']['y'][i].extend([0 for _ in range(len(negative_edges_on))])
         sorted_samples['n-n']['X'][i].extend(tmp_samples_nn)
         sorted_samples['n-n']['y'][i].extend([0 for _ in range(len(negative_edges_nn))])
-
+
+        old_training_nodes.update(graph.nodes())  # Record this graph's nodes in the set of old (already seen) nodes
+
     return sorted_samples, new_edges_count
 
 def generate_validation_data_cached(training_graphs, old_training_nodes, all_edgebanks, MAX_SAMPLES, dataset, seed, type_data, saved_data_file_path):

diff --git a/GraphGeneration/scripts/process_data.py b/GraphGeneration/scripts/process_data.py
@@ -173,7 +173,7 @@ def build_edgebanks_from_start(graphs, days=5):
         curr_edgebank = {}
 
         # Add edges from all previous graphs (not the current graph)
-        for j in range(max(i - days, 0), i):  # Loop through all previous graphs (graphs 0 to i-1)
+        for j in range(max(i - days, 0), i):  # Loop through at most `days` previous graphs (graphs max(i - days, 0) to i - 1)
             for u, v in graphs[j][-1].edges():  # Accessing the graph directly
                 u_key = u
                 v_key = v