diff --git a/GraphGeneration/encoder.yaml b/GraphGeneration/encoder.yaml new file mode 100644 index 0000000..53ab881 --- /dev/null +++ b/GraphGeneration/encoder.yaml @@ -0,0 +1,26 @@ +encoder_model: + name: topoGED + node2vec_setup: + node2vec_dimensions: 64 # We add features onto the end since Node2Vec doesn't embed features + node2vec_walk_length: 50 # Number of nodes visited per walk (Higher is more global, smaller is local) + node2vec_num_walks: 10 # Number of walks to start per node (Higher is more detailed and stable) + node2vec_p: 1.0 # Return parameter, the likelihood of revisiting a node (Higher is less backtracking) + node2vec_q: 1.0 # The walk bias for determining direction (Higher is more DFS-like; lower is BFS-like) + node2vec_window: 10 # The context size (Higher is broader learning) + node2vec_min_count: 1 # Minimum number of occurrences for a node to be considered (Higher will ignore more rare nodes) + node2vec_batch_words: 4 # The batch size for when Word2Vec is used (Higher will train faster; but with more memory) + node2vec_workers: 1 # Number of workers (threads) + hidden_dim: 64 + nodeEmbeddingType: "LSTM" + addOnFeature: "Position" + +decoder_model: + encode_links: "Concat" + +training: + batch_size: 64 # play around + lr: 0.001 + epochs: 500 + +dataset: CollegeMsg +seed: 1024 diff --git a/GraphGeneration/models/model.py b/GraphGeneration/models/model.py index dc454d1..49a1391 100644 --- a/GraphGeneration/models/model.py +++ b/GraphGeneration/models/model.py @@ -1,14 +1,9 @@ # Models in use -import os -import torch from GraphGeneration.models.MultiHeadedEdgePredictor import MultiHeadedEdgePredictor -from GraphGeneration.models.EdgePredictor import EdgePredictorMLP from GraphGeneration.models.temporal_gnn.script.utils.util import logger -from GraphGeneration.models.temporal_gnn.script.models.HTGN import HTGN -from GraphGeneration.models.GCLSTM import GCLSTM from GraphGeneration.models.SimpleNodeLSTM import SimpleNodeLSTM -def setupMLP(embedding_dim, embedding, mlpEncoding, embedOld): +def setupMLP(embedding_dim, mlpEncoding): """ Set up the MLP based on the arguments provided in the command line starter @@ -21,28 +16,15 @@ def setupMLP(embedding_dim, embedding, mlpEncoding, embedOld): input_dim = embedding_dim # Starting input dimension (two 32-dim node embeddings) # Set up the MLPs according to arguments - if embedOld == 'True': - flags = ['o-o-bank', 'o-o-nobank', 'o-n', 'n-n'] - else: - flags = ['o-o-nobank', 'o-n', 'n-n'] + flags = ['o-o-bank', 'o-o-nobank', 'o-n', 'n-n'] mlp = MultiHeadedEdgePredictor(in_channels=input_dim, hidden_channels=32, edge_types=flags, input_type=mlpEncoding) return mlp -def load_encoder_model(args, device, node2vec_dimensions, hidden_dim=64, HTGN_nodelist=[]): - if args.embeddingType == 'LSTM': +def load_encoder_model(encoder_config, device, node2vec_dimensions, hidden_dim=64, HTGN_nodelist=[]): + if encoder_config["encoder_model"]["nodeEmbeddingType"] == 'LSTM': model = SimpleNodeLSTM(input_dim=node2vec_dimensions, hidden_dim=hidden_dim).to(device) - elif args.embeddingType == 'GCLSTM': - model = GCLSTM(in_channels=node2vec_dimensions, hidden_channels=64).to(device) - model.device = device - elif args.embeddingType == 'HTGN': - args.num_nodes = len(HTGN_nodelist) - args.nfeat = node2vec_dimensions - args.nhid = 64 - args.nout = 64 - model = HTGN(args).to(device) - model.device = device else: raise Exception('pls define the model') - logger.info('using model {} '.format(args.embeddingType)) + logger.info('using model {} '.format(encoder_config["encoder_model"]["nodeEmbeddingType"])) return model, hidden_dim diff --git a/GraphGeneration/models/temporal_gnn/script/config.py b/GraphGeneration/models/temporal_gnn/script/config.py index 174e3a0..8bb01f6 100644 --- a/GraphGeneration/models/temporal_gnn/script/config.py +++ b/GraphGeneration/models/temporal_gnn/script/config.py @@ -50,14 +50,6 @@ parser.add_argument('--fixed_curvature', type=int, default=1, help='fixed (1) curvature or not (0)') parser.add_argument('--aggregation', type=str, default='deg', help='aggregation method: [deg, att]') -parser.add_argument("--strategy", type=str, required=False, default='MultiheadedMLP', choices=['MultiheadedMLP', 'SingleMLP'], help="The type of MLP NN to use") -parser.add_argument("--embedding", type=str, required=False, default='Position', choices=['Position', 'NodeType', 'Position+NodeType', 'None'], help="Allows appending positional encodings or an integer node type onto the end of the embeddings") -parser.add_argument("--mlpEncoding", type=str, required=False, default='Concat', choices=['Concat', 'Product', 'Addition', 'Subtraction'], help="How you want to input node embeddings to the MLP") # Product and addition lead to potential noise as we use directed graphs -parser.add_argument("--embedOld", type=str, required=False, default='True', choices=['True', 'False'], help="If you want to let the MLP predict edge type \'o-o-bank\', otherwise these edges are randomly added") -parser.add_argument("--oldDegree", type=str, required=False, default='False' ,choices=['True', 'False'], help="If you want reappearing nodes to reuse their most recent degree") -parser.add_argument("--trainingStyle", type=str, required=False, default='TrueGraphs', choices=['TrueGraphs', 'PredGraphs', 'MixedGraphs'], help="When training the MLP, decides if you use real graphs, predicted graphs (with first real as starter), or real then pred for MLP training") -parser.add_argument("--embeddingType", type=str, required=False, default='Node2Vec', choices=['Linear', 'Node2Vec', 'LSTM', 'GCLSTM', 'HTGN'], help="How nodes should be embedded. Either with Node2Vec or with a Linear mutliplication of adjacency matrix by node feature matrix") - # TopoGED mode parser.add_argument('--use_predict_probs', action='store_true', help='Use prediction probabilities to predict the next snapshot') parser.add_argument('--use_predict_graph_prediction', action='store_true', help='Use prediction graph description to predict the next snapshot') diff --git a/GraphGeneration/scripts/compute_embedding.py b/GraphGeneration/scripts/compute_embedding.py index f1ad371..e5ed15c 100644 --- a/GraphGeneration/scripts/compute_embedding.py +++ b/GraphGeneration/scripts/compute_embedding.py @@ -5,17 +5,11 @@ import torch from node2vec import Node2Vec from GraphGeneration.models.temporal_gnn.script.config import args +import yaml -# Node2Vec Parameters -node2vec_dimensions = args.nfeat # We add features onto the end since Node2Vec doesn't embed features -node2vec_walk_length = 50 # Number of nodes visited per walk (Higher is more global, smaller is local) -node2vec_num_walks = 10 # Number of walks to start per node (Higher is more detailed and stable) -node2vec_p = 1.0 # Return parameter, the likelihood of revisiting a node (Higher is less backtracking) -node2vec_q = 1.0 # The walk bias for determining direction (Higher is more DFS-like; lower is BFS-like) -node2vec_window = 10 # The context size (Higher is broader learning) -node2vec_min_count = 1 # Minimum number of occurrences for a node to be considered (Higher will ignore more rare nodes) -node2vec_batch_words = 4 # The batch size for when Word2Vec is used (Higher will train faster; but with more memory) -node2vec_workers = 1 # Number of workers (threads) +# Load YAML config +with open("GraphGeneration/encoder.yaml", "r") as file: + encoder_config = yaml.safe_load(file) def compute_linear_gnn_embeddings(G: nx.DiGraph, device): """ @@ -62,19 +56,20 @@ def compute_node2vec_embeddings(G: nx.DiGraph, device): """ node2vec = Node2Vec( G, - dimensions=node2vec_dimensions, - walk_length=node2vec_walk_length, - num_walks=node2vec_num_walks, - workers=node2vec_workers, - p=node2vec_p, - q=node2vec_q, + dimensions=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"], + walk_length=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_walk_length"], + num_walks=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_num_walks"], + workers=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_workers"], + p=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_p"], + q=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_q"], quiet=True ) model = node2vec.fit( - window=node2vec_window, - min_count=node2vec_min_count, - batch_words=node2vec_batch_words + window=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_window"], + min_count=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_min_count"], + batch_words=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_batch_words"], + workers=1 ) # Perform Node2Vec # Used to generate an embedding for isolated nodes @@ -111,7 +106,8 @@ def compute_node_embeddings_LSTM(graph_snapshots, lstm_model, device): # Collect per-timestep node embeddings node_history = defaultdict(list) old_nodes = set() - null_embed = torch.tensor([0]*(node2vec_dimensions), dtype=torch.float32).to(device) + null_embed = torch.tensor([0]*(encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"]), + dtype=torch.float32).to(device) for G in graph_snapshots: snapshot_embeddings = compute_node2vec_embeddings(G, device) for node, emb in snapshot_embeddings.items(): @@ -151,7 +147,7 @@ def get_GCN_data(graph_snapshots): x_list = [] edge_index_list = [] - F = node2vec_dimensions # number of features per node (change this if you want more features) + F = encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"] # number of features per node (change this if you want more features) for G in graph_snapshots: node2vec_embeddings = compute_node2vec_embeddings(G) diff --git a/GraphGeneration/scripts/load_data.py b/GraphGeneration/scripts/load_data.py index 5bffaea..514a512 100644 --- a/GraphGeneration/scripts/load_data.py +++ b/GraphGeneration/scripts/load_data.py @@ -10,7 +10,7 @@ from utils.loader import Loader -def load_data(dataset, strategy, embedding, mlpEncoding, embedOld, trainingStyle, embeddingType): +def load_data(dataset, embedding, mlpEncoding, embeddingType): my_loader = Loader() output_dir = os.path.abspath(f'data/input/cached/{dataset}') @@ -19,14 +19,14 @@ def load_data(dataset, strategy, embedding, mlpEncoding, embedOld, trainingStyle cached_data_dataset_folder = os.path.join(output_dir, 'saved_data/') # Construct output evaluation csv - structure_pred_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/structure_pred.csv' - structure_true_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/structure_true.csv' - structure_diff_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/structure_diff.csv' - kernel_pred_file_path = f'GraphGeneration/output/results/kernel/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/kernel_pred.csv' - kernel_true_file_path = f'GraphGeneration/output/results/kernel/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/kernel_true.csv' - edge_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/edge_analysis.csv' - topER_file_path = f'GraphGeneration/output/results/topER/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/toper_diff.csv' - animation_path = f'GraphGeneration/output/results/animations/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/pred_vs_true.mp4' + structure_pred_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/structure_pred.csv' + structure_true_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/structure_true.csv' + structure_diff_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/structure_diff.csv' + kernel_pred_file_path = f'GraphGeneration/output/results/kernel/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/kernel_pred.csv' + kernel_true_file_path = f'GraphGeneration/output/results/kernel/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/kernel_true.csv' + edge_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/edge_analysis.csv' + topER_file_path = f'GraphGeneration/output/results/topER/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/toper_diff.csv' + animation_path = f'GraphGeneration/output/results/animations/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/pred_vs_true.mp4' # Create file paths if needed for path in [structure_pred_file_path, structure_true_file_path, structure_diff_file_path, kernel_pred_file_path, @@ -282,10 +282,14 @@ def generate_validation_data(training_graphs, old_training_nodes, all_edgebanks, print(f"[FATAL] Unexpected failure at outer loop for edge ({u}, {v}): {type(e).__name__} - {e}") # Generate an equal amount of negative labels for each type of edge - negative_edges_oo = generate_negative_edges(graph, new_edges_count['o-o-bank'], edge_type='o-o-bank', edgebank=all_edgebanks, old_nodes=old_training_nodes) - negative_edges_oon = generate_negative_edges(graph, new_edges_count['o-o-nobank'], edge_type='o-o-nobank', edgebank=all_edgebanks, old_nodes=old_training_nodes) - negative_edges_on = generate_negative_edges(graph, new_edges_count['o-n'], edge_type='o-n', edgebank=all_edgebanks, old_nodes=old_training_nodes) - negative_edges_nn = generate_negative_edges(graph, new_edges_count['n-n'], edge_type='n-n', edgebank=all_edgebanks, old_nodes=old_training_nodes) + negative_edges_oo = generate_negative_edges(graph, new_edges_count['o-o-bank'], edge_type='o-o-bank', + edgebank=all_edgebanks, old_nodes=old_training_nodes) + negative_edges_oon = generate_negative_edges(graph, new_edges_count['o-o-nobank'], edge_type='o-o-nobank', + edgebank=all_edgebanks, old_nodes=old_training_nodes) + negative_edges_on = generate_negative_edges(graph, new_edges_count['o-n'], edge_type='o-n', + edgebank=all_edgebanks, old_nodes=old_training_nodes) + negative_edges_nn = generate_negative_edges(graph, new_edges_count['n-n'], edge_type='n-n', + edgebank=all_edgebanks, old_nodes=old_training_nodes) tmp_samples_oo = [torch.tensor([u, v]) for u, v in negative_edges_oo] tmp_samples_oon = [torch.tensor([u, v]) for u, v in negative_edges_oon] @@ -301,7 +305,9 @@ def generate_validation_data(training_graphs, old_training_nodes, all_edgebanks, sorted_samples['o-n']['y'][i].extend([0 for _ in range(len(negative_edges_on))]) sorted_samples['n-n']['X'][i].extend(tmp_samples_nn) sorted_samples['n-n']['y'][i].extend([0 for _ in range(len(negative_edges_nn))]) - + + old_training_nodes.update(graph.nodes()) # Add the old nodes + return sorted_samples, new_edges_count def generate_validation_data_cached(training_graphs, old_training_nodes, all_edgebanks, MAX_SAMPLES, dataset, seed, type_data, saved_data_file_path): diff --git a/GraphGeneration/scripts/process_data.py b/GraphGeneration/scripts/process_data.py index 0e9a013..daa4105 100644 --- a/GraphGeneration/scripts/process_data.py +++ b/GraphGeneration/scripts/process_data.py @@ -173,7 +173,7 @@ def build_edgebanks_from_start(graphs, days=5): curr_edgebank = {} # Add edges from all previous graphs (not the current graph) - for j in range(max(i - days, 0), i): # Loop through all previous graphs (graphs 0 to i-1) + for j in range(max(i - days, 0), i): # Loop through all previous graphs (graphs i - days to i-1) for u, v in graphs[j][-1].edges(): # Accessing the graph directly u_key = u v_key = v diff --git a/GraphGeneration/scripts/topoGED_end_to_end.py b/GraphGeneration/scripts/topoGED_end_to_end.py index 8e7941d..d3bbedb 100644 --- a/GraphGeneration/scripts/topoGED_end_to_end.py +++ b/GraphGeneration/scripts/topoGED_end_to_end.py @@ -10,11 +10,12 @@ import pandas as pd import os import sys +import yaml sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) from GraphGeneration.utils.Evaluator import Evaluator -from GraphGeneration.models.temporal_gnn.script.config import args from load_data import load_data, generate_training_data_cached, generate_validation_data_cached +from GraphGeneration.utils.casting_type import to_tensor from GraphGeneration.utils.sampling_edges_utils import predict_edges from GraphGeneration.utils.casting_type import to_tensor from GraphGeneration.utils.graph_construction_utils import compute_reappearance_probabilities, get_node_features, update_degrees @@ -24,18 +25,13 @@ from GraphGeneration.models.model import setupMLP, load_encoder_model # Import all node embedding methods -from compute_embedding import compute_embedding, node2vec_batch_words +from compute_embedding import compute_embedding from process_data import modifyGraphIds, build_edgebanks_from_start from torch.utils.data import DataLoader # Import Loss fn from GraphGeneration.scripts.composite_graphlet_loss_fn import GraphletLoss -# Set seeds -global_seed = args.seed -random.seed(global_seed) -np.random.seed(global_seed) - # Set up device try: if torch.cuda.is_available(): @@ -48,46 +44,61 @@ device = torch.device("cpu") print("Using CPU") +# Load YAML config +with open("GraphGeneration/encoder.yaml", "r") as file: + encoder_config = yaml.safe_load(file) + print(encoder_config) + +# Set seeds +random.seed(encoder_config["seed"]) +np.random.seed(encoder_config["seed"]) + class Runner(object): def __init__(self): - self.seed = global_seed + self.seed = encoder_config["seed"] # Set up Evaluator self.evaluator = Evaluator() # Some default file path self.file_visualization_path = "GraphGeneration/scripts/Visualize" - self.saved_input = os.path.abspath(f'data/input/cached/{args.dataset}/saved_data') - common_suffix = f"multiMLP_{args.strategy}_embedding{args.embedding}_mlpEncoding{args.mlpEncoding}_embeddingType{args.embeddingType}" - self.structure_dir = f"GraphGeneration/output/results/structure/{args.dataset}/{common_suffix}" - self.kernel_dir = f"GraphGeneration/output/results/kernel/{args.dataset}/{common_suffix}" - self.topER_dir = f"GraphGeneration/output/results/topER/{args.dataset}/{common_suffix}" - + self.saved_input = os.path.abspath(f'data/input/cached/{encoder_config["dataset"]}/saved_data') + common_suffix = f'topoGED_embedding{encoder_config["encoder_model"]["addOnFeature"]}_mlpEncoding{encoder_config["decoder_model"]["encode_links"]}_embeddingType{encoder_config["encoder_model"]["nodeEmbeddingType"]}' + self.structure_dir = f'GraphGeneration/output/results/structure/{encoder_config["dataset"]}/{common_suffix}' + self.kernel_dir = f'GraphGeneration/output/results/kernel/{encoder_config["dataset"]}/{common_suffix}' + self.topER_dir = f'GraphGeneration/output/results/topER/{encoder_config["dataset"]}/{common_suffix}' + + save_dir = os.path.join(self.file_visualization_path, encoder_config["dataset"], encoder_config["encoder_model"]["nodeEmbeddingType"]) + os.makedirs(save_dir, exist_ok=True) + # Current target snapshot we want to predict self.current_target_snapshot = 2 # All the edge types self.all_edge_types = ['o-o-bank', 'o-o-nobank', 'o-n', 'n-n'] + self.best_validation_model_auc = 0 # Load the global encoder & decoder model - self.encoder_model, input_dim = load_encoder_model(args, device=device, node2vec_dimensions=args.nfeat, hidden_dim=64) + self.encoder_model, self.input_dim = load_encoder_model(encoder_config, device=device, node2vec_dimensions=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"], + hidden_dim=encoder_config["encoder_model"]["hidden_dim"]) # Check if there is any add-on features we will plug at the end of encoder embedding - if args.embedding in ['NodeType', 'Position']: - input_dim += 1 - - self.link_prediction_decoder = setupMLP(embedding_dim=input_dim*2, embedding=args.embedding, mlpEncoding=args.mlpEncoding, embedOld=args.embedOld) + if encoder_config["encoder_model"]["addOnFeature"] in ['NodeType', 'Position']: + self.input_dim += 1 + + self.link_prediction_decoder = setupMLP(embedding_dim=self.input_dim*2, mlpEncoding=encoder_config["decoder_model"]["encode_links"]) self.link_prediction_decoder.to(device) # Load all the snapshot true data - self.probabilities, self.graph_descriptions, self.thresholds, self.target_graphs = load_data(args.dataset, args.strategy, args.embedding, args.mlpEncoding, args.embedOld, args.trainingStyle, args.embeddingType) + self.probabilities, self.graph_descriptions, self.thresholds, self.target_graphs = load_data(encoder_config["dataset"], encoder_config["encoder_model"]["addOnFeature"], + encoder_config["decoder_model"]["encode_links"], encoder_config["encoder_model"]["nodeEmbeddingType"]) # Modify the graph ids to 1,2,3,... self.target_graphs, _ = modifyGraphIds(self.target_graphs, self.thresholds) # Build the edgebanks for construction self.all_edgebanks = build_edgebanks_from_start(self.target_graphs) - + # Reshape the graph description self.graph_descriptions = [list(zip(graph_description[0::3], graph_description[1::3], graph_description[2::3])) for graph_description in self.graph_descriptions] @@ -96,14 +107,171 @@ def __init__(self): # Convert number of snapshots to integer self.num_snapshots = len(self.probabilities) self.train_end = int(0.8 * self.num_snapshots) - val_end = int(0.9 * self.num_snapshots) + self.val_end = int(0.9 * self.num_snapshots) # Assign snapshots self.training_graphs = [self.target_graphs[i][-1] for i in range(self.train_end)] - self.validation_graphs = [self.target_graphs[i][-1] for i in range(self.train_end, val_end)] - self.test_graphs = [self.target_graphs[i][-1] for i in range(val_end, self.num_snapshots)] + self.validation_graphs = [self.target_graphs[i][-1] for i in range(self.train_end, self.val_end)] + self.test_graphs = [self.target_graphs[i][-1] for i in range(self.val_end, self.num_snapshots)] + + # ======================= HELPER FUNCTIONS ======================= + def sample_old_nodes(self, prev_graphs, current_target_snapshot): + random.seed(self.seed) + np.random.seed(self.seed) + + # Sample old nodes + probs = compute_reappearance_probabilities(graphs=prev_graphs, t_curr=current_target_snapshot) + node_ids = list(probs.keys()) + weights = list(probs.values()) + sampled_old_nodes = list(np.random.choice(node_ids, size=self.current_target_count_old_nodes, replace=False, p=np.array(weights)/np.sum(weights))) # Makes sure that we select only unique nodes each time + + return set(sampled_old_nodes) + # ======================= TRAIN MODEL ======================= + def run_validation(self, validation_samples, batch_size, epoch): + train_auc = { + 'o-o-bank': [], + 'o-o-nobank': [], + 'o-n': [], + 'n-n': [], + } + # For computing AUC Scores + train_preds = [] + train_labels = [] + + for i in range(self.val_end): + snapshot = self.train_end + i + self.encoder_model.eval() + self.link_prediction_decoder.eval() + with torch.no_grad(): + print("INFO: Validation on snapshot", snapshot) + + # Prepare current target graph count + self.current_target_count_old_nodes = self.probabilities[snapshot][0] + self.current_target_count_new_nodes = self.probabilities[snapshot][1] + self.current_target_count = { + edge_type: self.probabilities[snapshot][j + 2] + for j, edge_type in enumerate(self.all_edge_types) + } + + node_types = { + "old_nodes": self.sample_old_nodes(self.training_graphs[:snapshot], snapshot), + "new_nodes": set() + } + + constructing_graph = nx.DiGraph() # Graph we try to predict + + # Adding old nodes to constructing_graph + constructing_graph.add_nodes_from(node_types['old_nodes']) + + for flag in self.all_edge_types: + curr_X_train = validation_samples[flag]['X'][i] + curr_y_train = validation_samples[flag]['y'][i] + + if len(curr_X_train) == 0 or len(curr_y_train) == 0: + print(f'No samples for edge type: {flag}') + continue + + curr_X_train = [x.cpu().detach().numpy() if torch.is_tensor(x) else x for x in curr_X_train] + curr_X_train = np.array(curr_X_train) + curr_y_train = np.array(curr_y_train) + + X_train_curr, curr_y_train = shuffle(curr_X_train, curr_y_train, random_state=self.seed) + temp_X_train = torch.tensor(X_train_curr, dtype=torch.float32).to(device) + temp_y_train = torch.tensor(curr_y_train, dtype=torch.float32).to(device) + train_loader = DataLoader(TensorDataset(temp_X_train, temp_y_train), batch_size=batch_size, shuffle=True) + + # Training graphs for predicting current snapshot + validation_graphs = self.training_graphs + self.validation_graphs[:i] + + for (x, y) in train_loader: + node_embeddings = compute_embedding(embeddingType=encoder_config["encoder_model"]["nodeEmbeddingType"], graphs=validation_graphs, encoder_model=self.encoder_model, device=device) + + # Get current embeddings + src_nodes = [int(n) for n in x[:, 0].tolist()] + dst_nodes = [int(n) for n in x[:, 1].tolist()] + + # Add new nodes to the node_types + for n in src_nodes: + if n not in node_embeddings and flag in ['o-n', 'n-n']: + node_types["new_nodes"].add(n) + constructing_graph.add_node(n) + node_embeddings[n] = torch.zeros(self.input_dim, device=device) + + for n in dst_nodes: + if n not in node_embeddings and flag in ['o-n', 'n-n']: + node_types["new_nodes"].add(n) + constructing_graph.add_node(n) + node_embeddings[n] = torch.zeros(self.input_dim, device=device) + + src_embed = torch.stack([ + node_embeddings[n] for n in src_nodes + ]) + + dst_embed = torch.stack([ + node_embeddings[n] for n in dst_nodes + ]) + + if src_embed.dim() == 1: + src_embed = src_embed.unsqueeze(1) + if dst_embed.dim() == 1: + dst_embed = dst_embed.unsqueeze(1) + + preds = self.link_prediction_decoder(src_embed=src_embed, dst_embed=dst_embed, edge_type=flag) + + if preds.dim() == 0: + preds = preds.unsqueeze(0) + if y.dim() == 0: # scalar value like torch.tensor(0.5) + y = y.unsqueeze(0) # make it [1] + elif y.dim() == 2 and y.size(1) == 1: # shape [batch_size, 1] + y = y.view(-1) + + # Add to our labels for evaluation + train_preds.extend(preds.detach().cpu().numpy()) + train_labels.extend(y.detach().cpu().numpy()) + break + + # Assign embeddings for all the training_nodes + curr_embeddings = compute_embedding(embeddingType=encoder_config["encoder_model"]["nodeEmbeddingType"], graphs=validation_graphs, encoder_model=self.encoder_model, device=device) + constructing_graph = get_node_features(constructing_graph.copy(), self.training_graphs + self.validation_graphs[:i], self.thresholds, self.graph_descriptions[snapshot], node_types["old_nodes"], node_types["new_nodes"]) + sampled_edges = predict_edges(constructing_graph, edge_type=flag, node_types=node_types, edgebank=self.all_edgebanks[snapshot], link_prediction_decoder=self.link_prediction_decoder, + old_node_embeddings=curr_embeddings, top_k=self.current_target_count[flag], graph_num=snapshot, device=device) + constructing_graph.add_edges_from(list(sampled_edges)) + update_degrees(constructing_graph) + + # Update the training_graphs to involve with the constructing graph + if flag == self.all_edge_types[0]: + validation_graphs.append(constructing_graph) + else: + validation_graphs[-1] = constructing_graph + + if len(np.unique(train_labels)) < 2: + train_auc[flag].append(0) + else: + train_auc[flag].append(roc_auc_score(train_labels, train_preds)) # Calculate scores + + # Record the Training Loss, AUC + current_model_auc = 0 #we take average of all edge types + + for flag in self.all_edge_types: + gpu_mem_alloc = torch.cuda.max_memory_allocated() / 1000000 if torch.cuda.is_available() else 0 + epochMessage = f"Epoch {epoch+1:02d} | Edge Type: {flag} | Validation AUCROC {np.mean(train_auc[flag]):.4f} | GPU: {gpu_mem_alloc:.1f}MiB" + current_model_auc += np.mean(train_auc[flag]) + print(epochMessage) + with open(rf'{self.file_visualization_path}\{encoder_config["dataset"]}\{encoder_config["encoder_model"]["nodeEmbeddingType"]}\multiheadMLP_performance.txt', "a") as f: + f.write(epochMessage + "\n") + + # We check and cache if it has the best auc + if current_model_auc/4 >= self.best_validation_model_auc: + self.best_validation_model_auc = current_model_auc + + print("INFO: Saving the model...") + torch.save(self.link_prediction_decoder.state_dict(), self.model_path) + torch.save(self.encoder_model.state_dict(), self.model_path) + print("INFO: The model is saved. Done.") + + def train_multi_head(self, training_samples, validation_samples, epochs=250, batch_size=64, training_new_edges_count=0): """ Train a MultiHeaded MLP Neural Network for use in edge predictions @@ -112,19 +280,17 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat model (MultiheadedMLP): The Multiheaded MLP to train now training_samples: The dictionary store the pos, neg edges of each snapshot, using for training validation_samples: The dictionary store the pos, neg edges of each snapshot, using for validation - epochs (int): The number of epochs to train for - batch_size (int): The batch size to use for the training data Returns: link_prediction_decoder (Multiheaded MLP): The trained MLP """ - lr = args.lr + lr = encoder_config["training"]["lr"] self.link_prediction_decoder.train() optimizer = torch.optim.Adam(list(self.encoder_model.parameters()) + list(self.link_prediction_decoder.parameters()), lr=lr) loss_fn = nn.BCELoss() graphlet_loss_fn = GraphletLoss() # Train - for epoch in range(epochs): + for epoch in range(encoder_config["training"]["epochs"]): train_loss = { 'o-o-bank': [], 'o-o-nobank': [], @@ -141,14 +307,9 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat train_preds = [] train_labels = [] - for snapshot in range(2, len(self.training_graphs)): + for snapshot in range(2, 16): print("INFO: Training on snapshot", snapshot) - node_types = { - "old_nodes": set().union(*(graph.nodes() for graph in self.training_graphs[:snapshot])), - "new_nodes": set() - } - # Prepare current target graph count self.current_target_count_old_nodes = self.probabilities[snapshot][0] self.current_target_count_new_nodes = self.probabilities[snapshot][1] @@ -157,6 +318,11 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat for j, edge_type in enumerate(self.all_edge_types) } + node_types = { + "old_nodes": self.sample_old_nodes(self.training_graphs[:snapshot], snapshot), + "new_nodes": set() + } + constructing_graph = nx.DiGraph() # Graph we try to predict # Adding old nodes to constructing_graph @@ -177,14 +343,14 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat X_train_curr, curr_y_train = shuffle(curr_X_train, curr_y_train, random_state=self.seed) temp_X_train = torch.tensor(X_train_curr, dtype=torch.float32).to(device) temp_y_train = torch.tensor(curr_y_train, dtype=torch.float32).to(device) - train_loader = DataLoader(TensorDataset(temp_X_train, temp_y_train), batch_size=batch_size, shuffle=True) + train_loader = DataLoader(TensorDataset(temp_X_train, temp_y_train), batch_size=encoder_config["training"]["batch_size"], shuffle=True) # Training graphs for predicting current snapshot training_graphs = self.training_graphs[:snapshot] for (x, y) in train_loader: optimizer.zero_grad() - node_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=training_graphs, encoder_model=self.encoder_model, device=device) + node_embeddings = compute_embedding(embeddingType=encoder_config["encoder_model"]["nodeEmbeddingType"], graphs=training_graphs, encoder_model=self.encoder_model, device=device) # Get current embeddings src_nodes = [int(n) for n in x[:, 0].tolist()] @@ -206,19 +372,25 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat constructing_graph.add_node(n) node_embeddings[n] = torch.zeros(embed_dim, device=node_embeddings[any_node].device) + # Assign embeddings for source nodes + # Assign zero vectors for new nodes for on and nn edge type src_embed = torch.stack([ node_embeddings[n] for n in src_nodes ]) + # Assign embeddings for dest nodes + # Assign zero vectors for new nodes for on and nn edge type dst_embed = torch.stack([ node_embeddings[n] for n in dst_nodes ]) + # Converting dim if src_embed.dim() == 1: src_embed = src_embed.unsqueeze(1) if dst_embed.dim() == 1: dst_embed = dst_embed.unsqueeze(1) + # Get predictions for link preds = self.link_prediction_decoder(src_embed=src_embed, dst_embed=dst_embed, edge_type=flag) if preds.dim() == 0: @@ -235,7 +407,6 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat graphlet_loss = graphlet_loss_fn(to_tensor(pred_kernel, device=device).unsqueeze(0), to_tensor(true_kernel, device=device).unsqueeze(0)) loss = 0.5*loss_fn(preds, y) + 0.5*graphlet_loss - # loss = loss_fn(preds, y) loss.backward() optimizer.step() train_loss[flag].append(loss.item()) @@ -244,8 +415,8 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat train_preds.extend(preds.detach().cpu().numpy()) train_labels.extend(y.detach().cpu().numpy()) - # Assign embeddings for all the training_nodes - curr_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=training_graphs, encoder_model=self.encoder_model, device=device) + # Constructing temp graph + curr_embeddings = compute_embedding(embeddingType=encoder_config["encoder_model"]["nodeEmbeddingType"], graphs=training_graphs, encoder_model=self.encoder_model, device=device) constructing_graph = get_node_features(constructing_graph.copy(), self.training_graphs[:snapshot], self.thresholds, self.graph_descriptions[snapshot], node_types["old_nodes"], node_types["new_nodes"]) sampled_edges = predict_edges(constructing_graph, edge_type=flag, node_types=node_types, edgebank=self.all_edgebanks[snapshot], link_prediction_decoder=self.link_prediction_decoder, old_node_embeddings=curr_embeddings, top_k=self.current_target_count[flag], graph_num=snapshot, device=device) @@ -253,25 +424,30 @@ def train_multi_head(self, training_samples, validation_samples, epochs=250, bat update_degrees(constructing_graph) # Update the training_graphs to involve with the constructing graph - if len(training_graphs) == snapshot: + if flag == self.all_edge_types[0]: training_graphs.append(constructing_graph) else: training_graphs[-1] = constructing_graph if len(np.unique(train_labels)) < 2: - train_auc.append(0) + train_auc[flag].append(0) else: train_auc[flag].append(roc_auc_score(train_labels, train_preds)) # Calculate scores + # Validation + self.run_validation(validation_samples=validation_samples, batch_size=encoder_config["training"]["batch_size"], epoch=epoch) + + # Record the Training Loss, AUC + gpu_mem_alloc = torch.cuda.max_memory_allocated() / 1000000 if torch.cuda.is_available() else 0 for flag in self.all_edge_types: - if (epoch + 1) % 100 == 0 or epoch == 0: - epochMessage = f"Epoch {epoch+1:02d} | Edge Type: {flag} | Train Loss: {np.mean(train_loss[flag]):.4f} | Train AUCROC {np.mean(train_auc[flag]):.4f}" + if (epoch + 1) % 20 == 0 or epoch == 0: + epochMessage = f"Epoch {epoch+1:02d} | Edge Type: {flag} | Train Loss: {np.mean(train_loss[flag]):.4f} | Train AUCROC {np.mean(train_auc[flag]):.4f} | GPU: {gpu_mem_alloc:.1f}MiB" print(epochMessage) - with open(rf"{self.file_visualization_path}\{args.dataset}\{args.embeddingType}\multiheadMLP_performance.txt", "a") as f: + with open(rf'{self.file_visualization_path}\{encoder_config["dataset"]}\{encoder_config["encoder_model"]["nodeEmbeddingType"]}\multiheadMLP_performance.txt', "a") as f: f.write(epochMessage + "\n") - return self.link_prediction_decoder + return self.link_prediction_decoder, self.encoder_model def train_models(self): """ @@ -290,23 +466,23 @@ def train_models(self): # Prepare training data training_sorted_samples, training_new_edges_count = generate_training_data_cached(training_graphs=self.training_graphs, - all_edgebanks=self.all_edgebanks, MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=global_seed, saved_data_file_path=self.saved_input) + all_edgebanks=self.all_edgebanks, MAX_SAMPLES=MAX_SAMPLES, dataset=encoder_config["dataset"], seed=self.seed, saved_data_file_path=self.saved_input) # Prepare validation data # We pass all_edgebanks of the training snapshots edgebanks validation_sorted_samples, training_new_edges_count = generate_validation_data_cached(training_graphs=self.validation_graphs, old_training_nodes=old_training_nodes, - all_edgebanks=self.all_edgebanks[self.train_end], MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=global_seed, type_data="validation", saved_data_file_path=self.saved_input) + all_edgebanks=self.all_edgebanks[self.train_end], MAX_SAMPLES=MAX_SAMPLES, dataset=encoder_config["dataset"], seed=self.seed, type_data="validation", saved_data_file_path=self.saved_input) # Prepare test data # We pass all_edgebanks of the training snapshots edgebanks test_sorted_samples, training_new_edges_count = generate_validation_data_cached(training_graphs=self.test_graphs, old_training_nodes=old_training_nodes, - all_edgebanks=self.all_edgebanks[self.train_end], MAX_SAMPLES=MAX_SAMPLES, dataset=args.dataset, seed=global_seed, type_data="test", saved_data_file_path=self.saved_input) + all_edgebanks=self.all_edgebanks[self.val_end], MAX_SAMPLES=MAX_SAMPLES, dataset=encoder_config["dataset"], seed=self.seed, type_data="test", saved_data_file_path=self.saved_input) print('Training') self.link_prediction_decoder = self.train_multi_head(training_samples=training_sorted_samples, validation_samples=validation_sorted_samples, - epochs=500, batch_size=64, training_new_edges_count=training_new_edges_count) + training_new_edges_count=training_new_edges_count) - return self.link_prediction_decoder + return self.link_prediction_decoder, self.encoder_model # ======================= BUILD GRAPH ======================= def build_accumulating_filtration_sequence_with_edgebank(self, current_target_snapshot): @@ -332,6 +508,9 @@ def build_accumulating_filtration_sequence_with_edgebank(self, current_target_sn current_target_graph_description = self.graph_descriptions[current_target_snapshot] prev_graphs = [graph[-1] for graph in self.target_graphs[:current_target_snapshot]] + # Prepare the graphs we have known + known_graphs = self.training_graphs[:self.current_target_snapshot] + V_total = int(current_target_graph_description[-1][0]) E_total = int(current_target_graph_description[-1][1]) W_total = current_target_graph_description[-1][2] @@ -365,7 +544,7 @@ def build_accumulating_filtration_sequence_with_edgebank(self, current_target_sn constructing_graph = get_node_features(constructing_graph, prev_graphs, self.thresholds, current_target_graph_description, old_nodes, new_nodes) # Assign embeddings for all the training_nodes - curr_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=prev_graphs, encoder_model=self.encoder_model, device=device) + curr_embeddings = compute_embedding(embeddingType=encoder_config["encoder_model"]["nodeEmbeddingType"], graphs=prev_graphs, encoder_model=self.encoder_model, device=device) # Assign zero vector for new nodes for new_node in new_nodes: @@ -382,7 +561,7 @@ def build_accumulating_filtration_sequence_with_edgebank(self, current_target_sn constructing_graph.add_edges_from(sampled_edges) update_degrees(constructing_graph) - new_embeddings = compute_embedding(embeddingType=args.embeddingType, graphs=prev_graphs + [constructing_graph], encoder_model=self.encoder_model, device=device) + new_embeddings = compute_embedding(embeddingType=encoder_config["encoder_model"]["nodeEmbeddingType"], graphs=prev_graphs + [constructing_graph], encoder_model=self.encoder_model, device=device) curr_embeddings.update(new_embeddings) # Recompute old node embeddings edge_pool = edge_pool + sampled_edges @@ -426,7 +605,7 @@ def evaluate(self, pred_graph, true_graph, node_types): on_kl_divergence_results = self.evaluator.kl_divergence_graphs(pred_on_graph, true_on_graph, mode="total") - with open(rf"{self.file_visualization_path}\{args.dataset}\{args.embeddingType}\kl_results_on.txt", "a") as f: + with open(rf'{self.file_visualization_path}\{encoder_config["dataset"]}\{encoder_config["encoder_model"]["nodeEmbeddingType"]}\kl_results_on.txt', "a") as f: f.write(f"{self.current_target_snapshot + 1}, {on_kl_divergence_results:.6f}\n") # Evaluate the graph of n-n @@ -435,7 +614,7 @@ def evaluate(self, pred_graph, true_graph, node_types): nn_kl_divergence_results = self.evaluator.kl_divergence_graphs(pred_nn_graph, true_nn_graph, mode="total") - with open(rf"{self.file_visualization_path}\{args.dataset}\{args.embeddingType}\kl_results_nn.txt", "a") as f: + with open(rf'{self.file_visualization_path}\{encoder_config["dataset"]}\{encoder_config["encoder_model"]["nodeEmbeddingType"]}\kl_results_nn.txt', "a") as f: f.write(f"{self.current_target_snapshot + 1}, {nn_kl_divergence_results:.6f}\n") # Evaluate the graph of old nodes @@ -454,8 +633,8 @@ def evaluate(self, pred_graph, true_graph, node_types): pd.DataFrame([true_kernel]).to_csv(f"{self.kernel_dir}/kernel_true.csv", mode='a', header=False, index=False) def run(self): - print("INFO: Dataset: {}".format(args.dataset)) - encoder_model_path = os.path.join(self.saved_input, rf"saved_models/encoder_{args.embeddingType}_{self.seed}") + print("INFO: Dataset: {}".format(encoder_config["dataset"])) + encoder_model_path = os.path.join(self.saved_input, rf'saved_models/encoder_{encoder_config["encoder_model"]["nodeEmbeddingType"]}_{self.seed}') decoder_model_path = os.path.join(self.saved_input, rf"saved_data/decoder_MLP_{self.seed}") if os.path.exists(encoder_model_path) and os.path.exists(decoder_model_path): @@ -467,19 +646,13 @@ def run(self): self.link_prediction_decoder.eval() self.encoder_model.eval() - print(f"✅ Link Prediction Decoder loaded from: {decoder_model_path}") - print(f"✅ Ecoder loaded from: {encoder_model_path}") + print(f"Link Prediction Decoder loaded from: {decoder_model_path}") + print(f"Encoder loaded from: {encoder_model_path}") else: # Train the Decoder and Encoder model print('Training the Link Prediction Decoder and Encoder') - self.link_prediction_decoder = self.train_models() + self.link_prediction_decoder, self.encoder_model = self.train_models() print('Finished training the Link Prediction Decoder and Encoder; Start Graph Construction') - - # saving the trained model - print("INFO: Saving the model...") - torch.save(self.link_prediction_decoder.state_dict(), self.model_path) - torch.save(self.encoder_model.state_dict(), self.model_path) - print("INFO: The model is saved. Done.") # Old graphs that we know up to now self.old_graphs = [self.target_graphs[0], self.target_graphs[1]] @@ -517,4 +690,4 @@ def run(self): runner.run() # To run the script -# python GraphGeneration/scripts/topoGED_end_to_end.py --embeddingType=LSTM --dataset=CollegeMsg --nfeat=64 \ No newline at end of file +# python GraphGeneration/scripts/topoGED_end_to_end.py \ No newline at end of file diff --git a/GraphGeneration/utils/graph_construction_utils.py b/GraphGeneration/utils/graph_construction_utils.py index 8c6513a..878a2a9 100644 --- a/GraphGeneration/utils/graph_construction_utils.py +++ b/GraphGeneration/utils/graph_construction_utils.py @@ -44,7 +44,7 @@ def compute_reappearance_probabilities(graphs, t_curr, decay_factor=3.0, alpha=1 return probs -def get_node_features(constructing_graph, previous_graphs, thresholds, embedding, old_nodes, new_nodes): +def get_node_features(constructing_graph, previous_graphs, thresholds, graph_description, old_nodes, new_nodes): """ Assign the maximum degree of a node, either using its last seen degree (if args.oldDegree == True) or randomly giving it one @@ -78,7 +78,7 @@ def get_node_features(constructing_graph, previous_graphs, thresholds, embedding degree = G.degree(node) existing_nodes[node] = (t, degree) - degree_counts = [embedding[i][0] for i in range(0, len(embedding))] + degree_counts = [graph_description[i][0] for i in range(0, len(graph_description))] degree_dict = {thresholds[i]: degree_counts[i] for i in range(len(thresholds))} @@ -86,7 +86,7 @@ def get_node_features(constructing_graph, previous_graphs, thresholds, embedding for degree, count in degree_dict.items(): degree_assignment.extend([degree] * count) - + # print(degree_assignment) random.shuffle(degree_assignment) @@ -97,13 +97,9 @@ def get_node_features(constructing_graph, previous_graphs, thresholds, embedding suitable_degrees = [d for d in degree_assignment if d >= old_degree] if suitable_degrees: assigned_degree = min(suitable_degrees) + degree_assignment.remove(assigned_degree) else: assigned_degree = degree_assignment.pop() - - if not degree_assignment: - pass - else: - degree_assignment.remove(assigned_degree) constructing_graph.nodes[node]['feat']['currDegree'] = 0 constructing_graph.nodes[node]['feat']['maxDegree'] = assigned_degree @@ -131,7 +127,6 @@ def update_degrees(graph: nx.DiGraph): if 'feat' not in graph.nodes[node]: graph.nodes[node]['feat'] = {'id': node} graph.nodes[node]['feat']['currDegree'] = 0 - graph.nodes[node]['feat']['maxDegree'] = assigned_degree else: graph.nodes[node]['feat']['currDegree'] = graph.degree(node) \ No newline at end of file diff --git a/GraphGeneration/utils/orca/orca.exe b/GraphGeneration/utils/orca/orca.exe index 18597d8..21192bd 100644 Binary files a/GraphGeneration/utils/orca/orca.exe and b/GraphGeneration/utils/orca/orca.exe differ diff --git a/graph_analysis/analyze_snapshot.py b/graph_analysis/analyze_snapshot.py index 5e94817..0436ee0 100644 --- a/graph_analysis/analyze_snapshot.py +++ b/graph_analysis/analyze_snapshot.py @@ -2,63 +2,48 @@ import sys import numpy as np +import yaml sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../'))) from utils.loader import Loader import argparse from GraphGeneration.scripts.load_data import load_data import matplotlib.pyplot as plt - -# Process arguments -parser = argparse.ArgumentParser() -parser.add_argument("--dataset", type=str, required=False, default='CollegeMsg', choices=['CollegeMsg', 'mathoverflow', 'networkadex', 'networkaeternity', 'networkaion', 'networkaragon', 'networkbancor', 'networkcentra', 'networkcoindash', 'Reddit_B', 'networkcindicator', 'networkiconomi', 'networkdgd']) -parser.add_argument("--strategy", type=str, required=False, default='MultiheadedMLP', choices=['MultiheadedMLP', 'SingleMLP', 'Multiheaded_LSTM_oo'], help="The type of MLP NN to use") -parser.add_argument("--embedding", type=str, required=False, default='Position', choices=['Position', 'NodeType', 'Position+NodeType', 'None'], help="Allows appending positional encodings or an integer node type onto the end of the embeddings") -parser.add_argument("--mlpEncoding", type=str, required=False, default='Concat', choices=['Concat', 'Product', 'Addition', 'Subtraction'], help="How you want to input node embeddings to the MLP") # Product and addition lead to potential noise as we use directed graphs -parser.add_argument("--embedOld", type=str, required=False, default='True', choices=['True', 'False'], help="If you want to let the MLP predict edge type \'o-o-bank\', otherwise these edges are randomly added") -parser.add_argument("--oldDegree", type=str, required=False, default='False' ,choices=['True', 'False'], help="If you want reappearing nodes to reuse their most recent degree") -parser.add_argument("--trainingStyle", type=str, required=False, default='TrueGraphs', choices=['TrueGraphs', 'PredGraphs', 'MixedGraphs'], help="When training the MLP, decides if you use real graphs, predicted graphs (with first real as starter), or real then pred for MLP training") -parser.add_argument("--embeddingType", type=str, required=False, default='Node2Vec', choices=['Linear', 'Node2Vec', 'LSTM'], help="How nodes should be embedded. Either with Node2Vec or with a Linear mutliplication of adjacency matrix by node feature matrix") -parser.add_argument("--snapshot", type=int, required=False, default=2) -args = parser.parse_args() - -my_loader = Loader() -probabilities, features, thresholds, target_graphs = load_data(args.dataset, args.strategy, args.embedding, args.mlpEncoding, args.embedOld, args.trainingStyle, args.embeddingType) -print(probabilities) -def visualize_edge_type_counts(dataset, max_snapshot=30): - # Limit to available snapshots - max_snapshot = min(max_snapshot, len(probabilities)) - - # Create the sliced dataframe - df = { - 'snapshot': [i for i in range(max_snapshot)], - 'OO-bank edges': [probabilities[i][2] for i in range(max_snapshot)], - 'OO-nobank edges': [probabilities[i][5] for i in range(max_snapshot)], - 'ON edges': [probabilities[i][4] for i in range(max_snapshot)], - 'NN edges': [probabilities[i][3] for i in range(max_snapshot)], - } - - snapshots = df['snapshot'] - width = 0.2 # width of each bar - x = np.arange(len(snapshots)) # base positions for each snapshot - - # Plot - plt.figure(figsize=(10, 6)) - plt.bar(x - 1.5 * width, df['OO-bank edges'], width=width, label='OO-bank', color='#4CAF50') - plt.bar(x - 0.5 * width, df['OO-nobank edges'], width=width, label='OO-nobank', color='#2196F3') - plt.bar(x + 0.5 * width, df['ON edges'], width=width, label='ON', color='#FFC107') - plt.bar(x + 1.5 * width, df['NN edges'], width=width, label='NN', color='#F44336') - - # Labels and title - plt.xlabel("Snapshot") - plt.ylabel("Edge Count") - plt.title(f"{dataset} Edge Type Counts per Snapshot") - plt.xticks(x, snapshots) - plt.grid(axis='y', linestyle='--', alpha=0.5) - plt.legend() - - plt.tight_layout() - plt.show() - - - -visualize_edge_type_counts(args.dataset) \ No newline at end of file +from GraphGeneration.scripts.process_data import modifyGraphIds, build_edgebanks_from_start + +# Load YAML config +with open("GraphGeneration/encoder.yaml", "r") as file: + encoder_config = yaml.safe_load(file) + print(encoder_config) + +# Load all the snapshot true data +probabilities, graph_descriptions, thresholds, target_graphs = load_data(encoder_config["dataset"], encoder_config["encoder_model"]["addOnFeature"], + encoder_config["decoder_model"]["encode_links"], encoder_config["encoder_model"]["nodeEmbeddingType"]) + +# Modify the graph ids to 1,2,3,... +target_graphs, _ = modifyGraphIds(target_graphs, thresholds) + +# Build the edgebanks for construction +all_edgebanks = build_edgebanks_from_start(target_graphs) +print(all_edgebanks[1]) +for snapshot in range(len(target_graphs)): + count = 0 + # Get the last graphs from up to 5 previous snapshots + prev_graphs = [graph[-1] for graph in target_graphs[max(snapshot - 5, 0): snapshot]] + prev_edges = set().union(*(graph.edges() for graph in prev_graphs)) + + # Count overlapping edges + for edge in target_graphs[snapshot][-1].edges(): + if edge in prev_edges: + count += 1 + + # Get total edgebank size for current snapshot + total_edgebank = sum(len(all_edgebanks[snapshot][u]) for u in all_edgebanks[snapshot]) + + # Write the result + with open("graph_analysis/analyze_oobank_count.txt", "a") as f: + f.write( + f"Snapshot: {snapshot}, " + f"EdgeBank: {total_edgebank}, " + f"#oobank_probs: {probabilities[snapshot][2]}, " + f"#oobank_re_cal: {count}\n" + ) \ No newline at end of file diff --git a/utils/loader.py b/utils/loader.py index 3f9393b..cf7c044 100644 --- a/utils/loader.py +++ b/utils/loader.py @@ -280,8 +280,7 @@ def to_cached(self): # Betweenness takes too long to process and are deemed not feasible activations = [EmbedDegree, EmbedForman, EmbedWeight, EmbedBetweenness, EmbedIncrementalCloseness] # All activation functions to use activation_names = ['Degree', 'Forman', 'Weight', 'Betweenness', 'Closeness'] - activations = [EmbedDegree, EmbedForman, EmbedWeight, EmbedIncrementalCloseness] # All activation functions to use - activation_names = ['Degree', 'Forman', 'Weight', 'Closeness'] + # If you want to use Betweenness, just run it here # activations = [EmbedBetweenness] # activation_names = ['Betweenness'] @@ -367,7 +366,7 @@ def to_cached(self): start_time = time.time() # Since Forman Ricci requires directed edges - if activation==EmbedForman: + if isinstance(activation, EmbedForman): embeddings, subgraphs, thresholds = my_activation.process_graphs_for_embeddings(graphs, is_directed=True) else: embeddings, subgraphs, thresholds = my_activation.process_graphs_for_embeddings(graphs)