Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions GraphGeneration/encoder.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
encoder_model:
name: topoGED
node2vec_setup:
node2vec_dimensions: 64 # We add features onto the end since Node2Vec doesn't embed features
node2vec_walk_length: 50 # Number of nodes visited per walk (Higher is more global, smaller is local)
node2vec_num_walks: 10 # Number of walks to start per node (Higher is more detailed and stable)
node2vec_p: 1.0 # Return parameter: controls how likely a walk is to step straight back to the node it just left (higher means less backtracking)
node2vec_q: 1.0 # In-out parameter: biases the walk direction (q > 1 keeps walks close to the previous node, BFS-like; q < 1 pushes walks outward, DFS-like)
node2vec_window: 10 # The context size (Higher is broader learning)
node2vec_min_count: 1 # Minimum number of occurrences for a node to be considered (Higher will ignore more rare nodes)
node2vec_batch_words: 4 # The batch size for when Word2Vec is used (Higher will train faster; but with more memory)
node2vec_workers: 1 # Number of workers (threads)
hidden_dim: 64
nodeEmbeddingType: "LSTM"
addOnFeature: "Position"

decoder_model:
encode_links: "Concat"

training:
batch_size: 64 # Tunable; experiment with other values
lr: 0.001
epochs: 500

dataset: CollegeMsg
seed: 1024
28 changes: 5 additions & 23 deletions GraphGeneration/models/model.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,9 @@
# Models in use
import os
import torch
from GraphGeneration.models.MultiHeadedEdgePredictor import MultiHeadedEdgePredictor
from GraphGeneration.models.EdgePredictor import EdgePredictorMLP
from GraphGeneration.models.temporal_gnn.script.utils.util import logger
from GraphGeneration.models.temporal_gnn.script.models.HTGN import HTGN
from GraphGeneration.models.GCLSTM import GCLSTM
from GraphGeneration.models.SimpleNodeLSTM import SimpleNodeLSTM

def setupMLP(embedding_dim, embedding, mlpEncoding, embedOld):
def setupMLP(embedding_dim, mlpEncoding):
"""
Set up the MLP based on the arguments provided in the command line starter

Expand All @@ -21,28 +16,15 @@ def setupMLP(embedding_dim, embedding, mlpEncoding, embedOld):
input_dim = embedding_dim # Starting input dimension (two 32-dim node embeddings)

# Set up the MLPs according to arguments
if embedOld == 'True':
flags = ['o-o-bank', 'o-o-nobank', 'o-n', 'n-n']
else:
flags = ['o-o-nobank', 'o-n', 'n-n']
flags = ['o-o-bank', 'o-o-nobank', 'o-n', 'n-n']
mlp = MultiHeadedEdgePredictor(in_channels=input_dim, hidden_channels=32, edge_types=flags, input_type=mlpEncoding)

return mlp

def load_encoder_model(args, device, node2vec_dimensions, hidden_dim=64, HTGN_nodelist=[]):
if args.embeddingType == 'LSTM':
def load_encoder_model(encoder_config, device, node2vec_dimensions, hidden_dim=64, HTGN_nodelist=[]):
if encoder_config["encoder_model"]["nodeEmbeddingType"] == 'LSTM':
model = SimpleNodeLSTM(input_dim=node2vec_dimensions, hidden_dim=hidden_dim).to(device)
elif args.embeddingType == 'GCLSTM':
model = GCLSTM(in_channels=node2vec_dimensions, hidden_channels=64).to(device)
model.device = device
elif args.embeddingType == 'HTGN':
args.num_nodes = len(HTGN_nodelist)
args.nfeat = node2vec_dimensions
args.nhid = 64
args.nout = 64
model = HTGN(args).to(device)
model.device = device
else:
raise Exception('pls define the model')
logger.info('using model {} '.format(args.embeddingType))
logger.info('using model {} '.format(encoder_config["encoder_model"]["nodeEmbeddingType"]))
return model, hidden_dim
8 changes: 0 additions & 8 deletions GraphGeneration/models/temporal_gnn/script/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,6 @@
parser.add_argument('--fixed_curvature', type=int, default=1, help='fixed (1) curvature or not (0)')
parser.add_argument('--aggregation', type=str, default='deg', help='aggregation method: [deg, att]')

parser.add_argument("--strategy", type=str, required=False, default='MultiheadedMLP', choices=['MultiheadedMLP', 'SingleMLP'], help="The type of MLP NN to use")
parser.add_argument("--embedding", type=str, required=False, default='Position', choices=['Position', 'NodeType', 'Position+NodeType', 'None'], help="Allows appending positional encodings or an integer node type onto the end of the embeddings")
parser.add_argument("--mlpEncoding", type=str, required=False, default='Concat', choices=['Concat', 'Product', 'Addition', 'Subtraction'], help="How you want to input node embeddings to the MLP") # Product and addition lead to potential noise as we use directed graphs
parser.add_argument("--embedOld", type=str, required=False, default='True', choices=['True', 'False'], help="If you want to let the MLP predict edge type \'o-o-bank\', otherwise these edges are randomly added")
parser.add_argument("--oldDegree", type=str, required=False, default='False' ,choices=['True', 'False'], help="If you want reappearing nodes to reuse their most recent degree")
parser.add_argument("--trainingStyle", type=str, required=False, default='TrueGraphs', choices=['TrueGraphs', 'PredGraphs', 'MixedGraphs'], help="When training the MLP, decides if you use real graphs, predicted graphs (with first real as starter), or real then pred for MLP training")
parser.add_argument("--embeddingType", type=str, required=False, default='Node2Vec', choices=['Linear', 'Node2Vec', 'LSTM', 'GCLSTM', 'HTGN'], help="How nodes should be embedded. Either with Node2Vec or with a Linear mutliplication of adjacency matrix by node feature matrix")

# TopoGED mode
parser.add_argument('--use_predict_probs', action='store_true', help='Use prediction probabilities to predict the next snapshot')
parser.add_argument('--use_predict_graph_prediction', action='store_true', help='Use prediction graph description to predict the next snapshot')
Expand Down
38 changes: 17 additions & 21 deletions GraphGeneration/scripts/compute_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,11 @@
import torch
from node2vec import Node2Vec
from GraphGeneration.models.temporal_gnn.script.config import args
import yaml

# Node2Vec parameters (module-level settings for the Node2Vec walk and fit steps)
node2vec_dimensions = args.nfeat # Embedding size, read from args.nfeat; we add features onto the end since Node2Vec doesn't embed features
node2vec_walk_length = 50 # Number of nodes visited per walk (Higher is more global, smaller is local)
node2vec_num_walks = 10 # Number of walks to start per node (Higher is more detailed and stable)
node2vec_p = 1.0 # Return parameter: controls how likely a walk is to step straight back to the node it just left (Higher is less backtracking)
node2vec_q = 1.0 # In-out parameter: biases the walk direction (q > 1 keeps walks close to the previous node, BFS-like; q < 1 pushes walks outward, DFS-like)
node2vec_window = 10 # The context size (Higher is broader learning)
node2vec_min_count = 1 # Minimum number of occurrences for a node to be considered (Higher will ignore more rare nodes)
node2vec_batch_words = 4 # The batch size for when Word2Vec is used (Higher will train faster; but with more memory)
node2vec_workers = 1 # Number of workers (threads)
# Load YAML config
with open("GraphGeneration/encoder.yaml", "r") as file:
encoder_config = yaml.safe_load(file)

def compute_linear_gnn_embeddings(G: nx.DiGraph, device):
"""
Expand Down Expand Up @@ -62,19 +56,20 @@ def compute_node2vec_embeddings(G: nx.DiGraph, device):
"""
node2vec = Node2Vec(
G,
dimensions=node2vec_dimensions,
walk_length=node2vec_walk_length,
num_walks=node2vec_num_walks,
workers=node2vec_workers,
p=node2vec_p,
q=node2vec_q,
dimensions=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"],
walk_length=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_walk_length"],
num_walks=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_num_walks"],
workers=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_workers"],
p=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_p"],
q=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_q"],
quiet=True
)

model = node2vec.fit(
window=node2vec_window,
min_count=node2vec_min_count,
batch_words=node2vec_batch_words
window=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_window"],
min_count=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_min_count"],
batch_words=encoder_config["encoder_model"]["node2vec_setup"]["node2vec_batch_words"],
workers=1
) # Perform Node2Vec

# Used to generate an embedding for isolated nodes
Expand Down Expand Up @@ -111,7 +106,8 @@ def compute_node_embeddings_LSTM(graph_snapshots, lstm_model, device):
# Collect per-timestep node embeddings
node_history = defaultdict(list)
old_nodes = set()
null_embed = torch.tensor([0]*(node2vec_dimensions), dtype=torch.float32).to(device)
null_embed = torch.tensor([0]*(encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"]),
dtype=torch.float32).to(device)
for G in graph_snapshots:
snapshot_embeddings = compute_node2vec_embeddings(G, device)
for node, emb in snapshot_embeddings.items():
Expand Down Expand Up @@ -151,7 +147,7 @@ def get_GCN_data(graph_snapshots):
x_list = []
edge_index_list = []

F = node2vec_dimensions # number of features per node (change this if you want more features)
F = encoder_config["encoder_model"]["node2vec_setup"]["node2vec_dimensions"] # number of features per node (change this if you want more features)

for G in graph_snapshots:
node2vec_embeddings = compute_node2vec_embeddings(G)
Expand Down
34 changes: 20 additions & 14 deletions GraphGeneration/scripts/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from utils.loader import Loader


def load_data(dataset, strategy, embedding, mlpEncoding, embedOld, trainingStyle, embeddingType):
def load_data(dataset, embedding, mlpEncoding, embeddingType):
my_loader = Loader()
output_dir = os.path.abspath(f'data/input/cached/{dataset}')

Expand All @@ -19,14 +19,14 @@ def load_data(dataset, strategy, embedding, mlpEncoding, embedOld, trainingStyle
cached_data_dataset_folder = os.path.join(output_dir, 'saved_data/')

# Construct output evaluation csv
structure_pred_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/structure_pred.csv'
structure_true_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/structure_true.csv'
structure_diff_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/structure_diff.csv'
kernel_pred_file_path = f'GraphGeneration/output/results/kernel/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/kernel_pred.csv'
kernel_true_file_path = f'GraphGeneration/output/results/kernel/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/kernel_true.csv'
edge_file_path = f'GraphGeneration/output/results/structure/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/edge_analysis.csv'
topER_file_path = f'GraphGeneration/output/results/topER/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/toper_diff.csv'
animation_path = f'GraphGeneration/output/results/animations/{dataset}/model_gen_retrain_{strategy}_embedding{embedding}_mlpEncoding{mlpEncoding}_embedOld{embedOld}_trainingStyle{trainingStyle}_embeddingType{embeddingType}/pred_vs_true.mp4'
structure_pred_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/structure_pred.csv'
structure_true_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/structure_true.csv'
structure_diff_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/structure_diff.csv'
kernel_pred_file_path = f'GraphGeneration/output/results/kernel/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/kernel_pred.csv'
kernel_true_file_path = f'GraphGeneration/output/results/kernel/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/kernel_true.csv'
edge_file_path = f'GraphGeneration/output/results/structure/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/edge_analysis.csv'
topER_file_path = f'GraphGeneration/output/results/topER/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/toper_diff.csv'
animation_path = f'GraphGeneration/output/results/animations/{dataset}/topoGED_embedding{embedding}_mlpEncoding{mlpEncoding}_embeddingType{embeddingType}/pred_vs_true.mp4'

# Create file paths if needed
for path in [structure_pred_file_path, structure_true_file_path, structure_diff_file_path, kernel_pred_file_path,
Expand Down Expand Up @@ -282,10 +282,14 @@ def generate_validation_data(training_graphs, old_training_nodes, all_edgebanks,
print(f"[FATAL] Unexpected failure at outer loop for edge ({u}, {v}): {type(e).__name__} - {e}")

# Generate an equal amount of negative labels for each type of edge
negative_edges_oo = generate_negative_edges(graph, new_edges_count['o-o-bank'], edge_type='o-o-bank', edgebank=all_edgebanks, old_nodes=old_training_nodes)
negative_edges_oon = generate_negative_edges(graph, new_edges_count['o-o-nobank'], edge_type='o-o-nobank', edgebank=all_edgebanks, old_nodes=old_training_nodes)
negative_edges_on = generate_negative_edges(graph, new_edges_count['o-n'], edge_type='o-n', edgebank=all_edgebanks, old_nodes=old_training_nodes)
negative_edges_nn = generate_negative_edges(graph, new_edges_count['n-n'], edge_type='n-n', edgebank=all_edgebanks, old_nodes=old_training_nodes)
negative_edges_oo = generate_negative_edges(graph, new_edges_count['o-o-bank'], edge_type='o-o-bank',
edgebank=all_edgebanks, old_nodes=old_training_nodes)
negative_edges_oon = generate_negative_edges(graph, new_edges_count['o-o-nobank'], edge_type='o-o-nobank',
edgebank=all_edgebanks, old_nodes=old_training_nodes)
negative_edges_on = generate_negative_edges(graph, new_edges_count['o-n'], edge_type='o-n',
edgebank=all_edgebanks, old_nodes=old_training_nodes)
negative_edges_nn = generate_negative_edges(graph, new_edges_count['n-n'], edge_type='n-n',
edgebank=all_edgebanks, old_nodes=old_training_nodes)

tmp_samples_oo = [torch.tensor([u, v]) for u, v in negative_edges_oo]
tmp_samples_oon = [torch.tensor([u, v]) for u, v in negative_edges_oon]
Expand All @@ -301,7 +305,9 @@ def generate_validation_data(training_graphs, old_training_nodes, all_edgebanks,
sorted_samples['o-n']['y'][i].extend([0 for _ in range(len(negative_edges_on))])
sorted_samples['n-n']['X'][i].extend(tmp_samples_nn)
sorted_samples['n-n']['y'][i].extend([0 for _ in range(len(negative_edges_nn))])


old_training_nodes.update(graph.nodes()) # Add the old nodes

return sorted_samples, new_edges_count

def generate_validation_data_cached(training_graphs, old_training_nodes, all_edgebanks, MAX_SAMPLES, dataset, seed, type_data, saved_data_file_path):
Expand Down
2 changes: 1 addition & 1 deletion GraphGeneration/scripts/process_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def build_edgebanks_from_start(graphs, days=5):
curr_edgebank = {}

# Add edges from all previous graphs (not the current graph)
for j in range(max(i - days, 0), i): # Loop through all previous graphs (graphs 0 to i-1)
for j in range(max(i - days, 0), i): # Loop through all previous graphs (graphs i - days to i-1)
for u, v in graphs[j][-1].edges(): # Accessing the graph directly
u_key = u
v_key = v
Expand Down
Loading