Skip to content

Commit 3aa7ac4

Browse files
committed
Mining Pipeline: Supporting multiple datasets for stats.
1 parent c8564e8 commit 3aa7ac4

2 files changed

Lines changed: 45 additions & 30 deletions

File tree

mining/plot_dot_from_graph_ml.sh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#!/bin/bash
2-
input_file = $1
3-
output_file = $2
2+
# Shell assignments must not have spaces around '='; with spaces, bash
# tries to run a command named "input_file" instead of setting the variable.
input_file="$1"
3+
# No spaces around '=': otherwise bash runs "output_file" as a command
# rather than assigning the variable.
output_file="$2"
44

5-
graphml2gv $1 -o temp.dot
6-
dot -Tpng temp.dot -o $2
5+
graphml2gv "$1" -o temp.dot
6+
dot -Tpng temp.dot -o "$2"
77

88
#Clean-Up
9-
rm temp.dot
9+
rm temp.dot

mining/subgraph_statistics.py

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@ class LatticeNode():
2121
Furthermore, each lattice node has pointers to the graphs in a graph database that contain it (i.e., its occurrences).
2222
'''
2323

24-
def __init__(self, graph: IsoGraph):
24+
def __init__(self, graph: IsoGraph, label_name = 'label'):
2525
self.graph = graph
26+
self.graph.set_label(label_name)
27+
2628
self.occurrences = []
2729
self.parents = []
2830
self.children = []
@@ -200,16 +202,17 @@ class Statistics():
200202
'''
201203
Provides basic counting statistics for a set of subgraphs and a graph database.
202204
'''
203-
def __init__(self, graph_db: List[nx.DiGraph], subgraphs: List[nx.DiGraph], label_name = 'label', override_names=True):
205+
def __init__(self, graph_db: List[nx.DiGraph], subgraphs: List[nx.DiGraph], label_name = 'label', override_names=False, lattice: Lattice=None):
204206
self.graph_db = [IsoGraph(graph) for graph in graph_db]
205207
self.subgraphs = [IsoGraph(graph) for graph in subgraphs]
206208
self.label_name = label_name
207209
self._set_label_name()
208210
self.override_names = override_names
211+
# Rename subgraphs if necessary
209212
self._name_subgraphs()
210213
self.occurrences_transaction = {}
211214
self.occurrences_embeddings = {}
212-
self.lattice = None
215+
self.lattice = lattice
213216

214217
def _set_label_name(self):
215218
for graph in self.graph_db:
@@ -242,10 +245,11 @@ def compute_occurrences_brute_force(self):
242245
self.occurrences_transaction[subgraph.name] = occurrences
243246

244247
def compute_occurrences_lattice_based(self):
245-
# First create the lattice node for the subgraphs
246-
lattice_nodes = [LatticeNode(subgraph) for subgraph in self.subgraphs]
247-
# Create lattice (this might take some time)
248-
self.lattice = Lattice(lattice_nodes)
248+
if self.lattice is None:
249+
# First create the lattice node for the subgraphs
250+
lattice_nodes = [LatticeNode(subgraph) for subgraph in self.subgraphs]
251+
# Create lattice (this might take some time)
252+
self.lattice = Lattice(lattice_nodes)
249253
# Execute occurrence computation
250254
self.lattice.count_occurrences(self.graph_db)
251255
# Store number of occurrences here
@@ -263,17 +267,14 @@ def write_as_csv(self, save_path):
263267
csvwriter.writerow([subgraph.name, occurrences_embeddings, occurrences_transaction])
264268

265269
def main(graph_db_path: str, subgraphs_path:str, results_dir:str):
266-
# Read db
267-
print("Parsing graph input files")
268-
graph_db = import_tlv_folder(graph_db_path, parse_support=False)
269270
subgraphs = import_tlv_folder(subgraphs_path, parse_support=False)
270271

271272
#TODO REMOVE THIS AGAIN, THIS IS ONLY TO FIT TO THE TEST DATA
272273
#subgraphs = [graph.reverse() for graph in subgraphs]
273274

274275
#TODO Workaround since a dummy root has been added by a previous step
275276
#subgraphs = [IsoGraph(graph).cut_root() for graph in subgraphs]
276-
277+
277278
# Get rid of clones
278279
nb_initial_subgraphs = len(subgraphs)
279280
print("Removing duplicates. This might take some time...")
@@ -282,9 +283,34 @@ def main(graph_db_path: str, subgraphs_path:str, results_dir:str):
282283
removed_duplicates = nb_initial_subgraphs - nb_pruned_subgraphs
283284
print("Removed %d duplicates" % removed_duplicates)
284285

286+
print("Creating subgraph lattice for lattice-based counting...")
287+
# First create the lattice node for the subgraphs
288+
lattice_nodes = [LatticeNode(subgraph) for subgraph in subgraphs]
289+
# Create lattice (this might take some time)
290+
lattice = Lattice(lattice_nodes)
291+
292+
print("Exporting lattice.")
293+
nx_lattice = lattice.to_networkx()
294+
export_TLV([nx_lattice], results_dir + 'lattice.lg')
295+
#plot_graphs([nx_lattice], results_dir + 'lattice.png')
296+
#plot_graph_dot(nx_lattice, results_dir + 'lattice_dot.png')
297+
with open(results_dir + 'lattice.graphml', 'w') as f:
298+
f.write(lattice.to_graphml())
299+
300+
# Write subgraphs without clones
301+
print("Writing subgraphs without occurrences.")
302+
export_TLV(subgraphs, results_dir + 'subgraph_candidates.lg')
303+
304+
for folder in os.listdir(graph_db_path):
305+
# Read db
306+
print(f"Parsing graph database for data set {folder}")
307+
graph_db = import_tlv_folder(graph_db_path+"/"+folder+"/", parse_support=False)
308+
compute_statistics(graph_db, subgraphs, lattice, results_dir)
309+
310+
def compute_statistics(graph_db, subgraphs, lattice, results_dir):
285311
# Compute statistics
286312
print("Counting the subgraph occurrences in the graph database. This might take some time...")
287-
stats = Statistics(graph_db, subgraphs)
313+
stats = Statistics(graph_db, subgraphs, lattice=lattice)
288314
#start = time.time()
289315
#stats.compute_occurrences_brute_force()
290316
#print(stats.occurrences_transaction)
@@ -295,24 +321,13 @@ def main(graph_db_path: str, subgraphs_path:str, results_dir:str):
295321
stats.compute_occurrences_lattice_based()
296322
stop = time.time()
297323
print("Computing occurrences lattice based took %f seconds." % (stop-start))
298-
if stats.lattice is not None:
299-
print("Exporting lattice.")
300-
nx_lattice = stats.lattice.to_networkx()
301-
export_TLV([nx_lattice], results_dir + 'lattice.lg')
302-
#plot_graphs([nx_lattice], results_dir + 'lattice.png')
303-
#plot_graph_dot(nx_lattice, results_dir + 'lattice_dot.png')
304-
with open(results_dir + 'lattice.graphml', 'w') as f:
305-
f.write(stats.lattice.to_graphml())
306-
307-
# Write subgraphs without clones
308-
print("Writing subgraphs without occurrences.")
309-
export_TLV(subgraphs, results_dir + 'subgraph_candidates.lg')
310324

311325
# Write statistics to file
312326
print("Write occurrence statistics...")
313327
stats.write_as_csv(results_dir + 'occurrence_stats.csv')
314328
print("Done")
315-
329+
330+
316331
if __name__ == "__main__":
317332
if len(sys.argv) < 4:
318333
print("Three arguments expected: path to graph database folder, path to subgraph database folder, path to results directory")

0 commit comments

Comments
 (0)