
Commit c8564e8

Extending Variability Pattern Mining Scripts.
1 parent 31313cb commit c8564e8

5 files changed

Lines changed: 89 additions & 43 deletions


mining/compute_components.py

Lines changed: 22 additions & 20 deletions
@@ -36,30 +36,30 @@ def get_graph_components(graphs, filtered=False, filter_config=None):
 
         nb_of_components_per_graph.append(len(new_components))
         components += new_components
-    filtered_total = 0
     if filtered:
-        components, filtered_total = filter_too_large(*filter_too_many_similar_nodes(components, 0, filter_config.filter_too_many_similar_max_similar, filter_config.filter_too_many_similar_max_nodes), filter_config.filter_too_large_nb_nodes, filter_config.filter_too_large_nb_edges)
 
-    print("We have %d connected components in %d graphs. From these components %d have beend filtered." % (len(components) + filtered_total, len(graphs), filtered_total))
+        components, filtered = filter_too_large(*filter_too_many_similar_nodes(components, {}, filter_config.filter_too_many_similar_max_similar, filter_config.filter_too_many_similar_max_nodes), filter_config.filter_too_large_nb_nodes, filter_config.filter_too_large_nb_edges)
 
-    return components, nb_of_components_per_graph, filtered_total
+    #print("We have %d connected components in %d graphs. From these components %d have been filtered." % (len(components) + sum(filtered.values()), len(graphs), sum(filtered.values())))
+
+    return components, nb_of_components_per_graph, filtered
 
 # Filters components with more than nb_nodes/nb_edges nodes/edges. Use -1 for infinity.
-def filter_too_large(components: list, filtered_total: int, nb_nodes=18, nb_edges=40):
+def filter_too_large(components: list, filtered: dict, nb_nodes=18, nb_edges=40):
     new_components = []
 
     for component in components:
         if not (nb_nodes != -1 and (component.number_of_nodes() > nb_nodes or component.number_of_edges() > nb_edges)):
             new_components.append(component)
 
 
-    filtered_total += len(components)-len(new_components)
-    print("Filtered out %d components that are too large, i.e., more than %d nodes or %d edges" % (filtered_total, nb_nodes, nb_edges))
-    return new_components, filtered_total
+    filtered["too_large"] = len(components)-len(new_components)
+    #print("Filtered out %d components that are too large, i.e., more than %d nodes or %d edges" % (filtered["too_large"], nb_nodes, nb_edges))
+    return new_components, filtered
 
 
 # Several filters need to be applied to filter out components which could lead to too high computational efforts
-def filter_too_many_similar_nodes(components: list, filtered_total: int, max_similar=2, max_nodes=10):
+def filter_too_many_similar_nodes(components: list, filtered: dict, max_similar=2, max_nodes=10):
     new_components = []
 
     for component in components:
@@ -68,9 +68,9 @@ def filter_too_many_similar_nodes(components: list, filtered_total: int, max_sim
         if not (np.sum(np.array(list(labels.values())) > max_nodes) > max_similar):
             new_components.append(component)
 
-    filtered_total += len(components)-len(new_components)
-    print("Filtered out %d components with too many similar nodes, i.e., more than %d labels appeared more than %d times" % (filtered_total, max_similar, max_nodes))
-    return new_components, filtered_total
+    filtered["too_many_similar"] = len(components)-len(new_components)
+    #print("Filtered out %d components with too many similar nodes, i.e., more than %d labels appeared more than %d times" % (filtered["too_many_similar"], max_similar, max_nodes))
+    return new_components, filtered
 
 
 def label_count_for_component(component):
@@ -97,9 +97,9 @@ def get_components(input_folder, formatting=INPUT_FORMAT_NX, filtered=False):
     return components, nb_of_components_per_diff, filtered_total
 
 
-def main(input_folder, output_folder, formatting=INPUT_FORMAT_NX, max_components_per_file=200):
+def main(input_folder, output_folder, dataset_name, max_components_per_file=200, formatting=INPUT_FORMAT_NX):
     # TODO doing this with streams and yield would be a nicer solution to the chunking.
-    components, nb_of_components_per_diff, filtered_total = get_components(input_folder, formatting=formatting, filtered=True)
+    components, nb_of_components_per_diff, filtered = get_components(input_folder, formatting=formatting, filtered=True)
 
     components_batched = [components[i*max_components_per_file:min((i+1)*max_components_per_file, len(components))] for i in range(ceil(len(components)/max_components_per_file))]
 
@@ -117,12 +117,14 @@ def main(input_folder, output_folder, formatting=INPUT_FORMAT_NX, max_components
         #export_subdue_python_json(batch, set_name + '/connected_components.json')
 
     with open(output_folder + '/filter_stats.csv', 'w') as f:
-        f.write(str(len(components)) + "," + str(filtered_total))
+        # ds name, all components, components after filtering, filtered too large, filtered too many similar
+
+        f.write(dataset_name + "," + str(len(components) + filtered["too_large"] + filtered["too_many_similar"]) + "," + str(len(components)) + "," + str(filtered["too_large"]) + "," + str(filtered["too_many_similar"]))
 
 if __name__ == "__main__":
-    if len(sys.argv) == 3:
-        main(sys.argv[1], sys.argv[2])
-    elif len(sys.argv) == 4:
-        main(sys.argv[1], sys.argv[2], formatting = sys.argv[3])
+    if len(sys.argv) == 5:
+        main(sys.argv[1], sys.argv[2], sys.argv[3], int(sys.argv[4]))
+    elif len(sys.argv) == 6:
+        main(sys.argv[1], sys.argv[2], sys.argv[3], int(sys.argv[4]), formatting = sys.argv[5])
     else:
-        print("Unexpected number of arguments. At least input path and output path has to be provided. Optionally as a third argument put NX or LG indicating the input graph formatting.")
+        print("Unexpected number of arguments. At least input path, output path, dataset name, and batch size have to be provided. Optionally, as a fifth argument, put NX or LG indicating the input graph formatting.")
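
Note on the change above: the two filters no longer increment a single filtered_total but record their counts under separate keys of a shared dict, which main() then writes to filter_stats.csv together with the dataset name. A minimal, self-contained sketch of that bookkeeping (networkx graphs stand in for the real diff-graph components; the dataset name and the simplified similar-nodes check are illustrative only, not part of the commit):

# Illustrative sketch of the filtered-dict bookkeeping introduced above.
import networkx as nx

def filter_too_many_similar_nodes(components, filtered, max_similar=2, max_nodes=10):
    kept = list(components)  # the label-frequency check is omitted in this sketch
    filtered["too_many_similar"] = len(components) - len(kept)
    return kept, filtered

def filter_too_large(components, filtered, nb_nodes=18, nb_edges=40):
    kept = [c for c in components
            if nb_nodes == -1 or (c.number_of_nodes() <= nb_nodes and c.number_of_edges() <= nb_edges)]
    filtered["too_large"] = len(components) - len(kept)  # one count per filter instead of a running total
    return kept, filtered

components = [nx.path_graph(3), nx.complete_graph(25)]  # toy stand-ins for diff-graph components
components, filtered = filter_too_large(*filter_too_many_similar_nodes(components, {}))

# filter_stats.csv row written by main(): dataset name, components before filtering,
# components after filtering, filtered as too large, filtered for too many similar nodes
row = ",".join([
    "dataset_1",
    str(len(components) + filtered["too_large"] + filtered["too_many_similar"]),
    str(len(components)),
    str(filtered["too_large"]),
    str(filtered["too_many_similar"]),
])
print(row)  # -> dataset_1,2,1,1,0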

mining/pipeline.sh

File mode changed: 100644 → 100755
Lines changed: 55 additions & 15 deletions
@@ -1,27 +1,67 @@
 #!/bin/bash
 
+# call as pipeline.sh [input-path] [output-path] [lib-path]
+# e.g. ./pipeline.sh '/home/schwinnez/projects/Frequent Subgraph Mining/case studies/variability_patterns/dataset_1/diffgraphs/' '/home/schwinnez/projects/Frequent Subgraph Mining/case studies/variability_patterns/dataset_1/output/' './lib/'
+
+
+
+
+
+python_version=$(python3 --version)
+echo "Using python version $python_version"
+# TODO automatically make sure that python 3 is used
 
-# Definition of output directories
-output_filtered="$2filtered/"
-output_mining="$2mining/"
-output_mining_no_duplicates="$2mining_no_duplicates/"
-lib_path="$3"
-parsemis_path="$3parsemis.jar"
 target_subtree_count_for_threshold_estimation=300
 threshold=10 # 0 means read threshold from files
+batch_size=200
 min_size=4
 max_size=15
 
-mkdir $2
 
-# Step 1 - Read graph databases, filter and chunk - Default filter config: Not larger than 15 nodes, 30 edges, no more than
-python compute_components.py $1 $output_filtered LG
+echo "Input: $1"
+echo "Output: $2"
+echo "Libs: $3"
+
+mkdir -p "$2"
+
+
+run_dataset(){
+    echo "Input: $1"
+    echo "Output: $2"
+    echo "Libs: $3"
+
+    # Definition of output directories
+    data_set_name="$4"
+    input_path="$1"
+    output_filtered="$2filtered/"
+    output_mining="$2mining/"
+    output_mining_no_duplicates="$2mining_no_duplicates/"
+    lib_path="$3"
+    parsemis_path="$3parsemis.jar"
+
+    echo "Running dataset: $data_set_name"
+    mkdir -p "$2"
+
+
+    # Step 1 - Read graph databases, filter and chunk - Default filter config: Not larger than 15 nodes, 30 edges, no more than
+    python3 compute_components.py "$input_path" "$output_filtered" "$data_set_name" $batch_size LG
+
+    # Step 2 - Compute thresholds - Not better than fixed threshold for Linux dataset
+    #python bisect_threshold_search.py $lib_path $output_filtered $target_subtree_count_for_threshold_estimation
+
+    # Step 3 - Mining
+    python3 run_parsemis.py "$parsemis_path" "$output_filtered" "$output_mining" $threshold $min_size $max_size
+
+    # Step 4 - Remove duplicates
+    python3 remove_duplicates.py "$output_mining" "$output_mining_no_duplicates" "$data_set_name"
+}
+
+# Run for every dataset in input folder
+for input_folder in "$1"/*/ ; do
+    dataset="$(basename "$input_folder")"
+    output_base="$2/$batch_size-$threshold/$dataset/"
+    run_dataset "$input_folder" "$output_base" "$3" "$dataset"
+done
 
-# Step 2 - Compute thresholds - Not better than fixed threshold for Linux dataset
-#python bisect_threshold_search.py $lib_path $output_filtered $target_subtree_count_for_threshold_estimation
 
-# Step 3 - Mining
-python run_parsemis.py $parsemis_path $output_filtered $output_mining $threshold $min_size $max_size
 
-# Step 4 - Remove duplicates
-python remove_duplicates.py output_mining output_mining_no_duplicates
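
The script now wraps the four steps in a run_dataset function and calls it once per sub-folder of the input path, nesting each dataset's output under a <batch_size>-<threshold> directory. A small Python sketch of that path convention (the concrete root paths are assumptions, not part of the commit):

# Illustrative sketch of the folder convention used by the new per-dataset loop.
from pathlib import Path

input_root = Path("/data/diffgraphs")   # assumed layout: one sub-folder per dataset
output_root = Path("/data/output")
batch_size, threshold = 200, 10         # same defaults as at the top of pipeline.sh

for dataset_dir in sorted(p for p in input_root.iterdir() if p.is_dir()):
    dataset = dataset_dir.name
    output_base = output_root / f"{batch_size}-{threshold}" / dataset
    # run_dataset then creates filtered/, mining/ and mining_no_duplicates/ below output_base
    print(dataset_dir, "->", output_base)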

mining/remove_duplicates.py

Lines changed: 5 additions & 5 deletions
@@ -3,7 +3,7 @@
 import sys
 import os
 
-def main(subgraphs_path, results_dir):
+def main(subgraphs_path, results_dir, dataset):
     # Import graphs
     print("Parsing graph input files")
     subgraphs = import_tlv_folder(subgraphs_path, parse_support=False)
@@ -21,13 +21,13 @@ def main(subgraphs_path, results_dir):
     print("Writing subgraphs.")
     export_TLV(subgraphs, results_dir + './results_no_duplicates.lg')
     with open(results_dir + './results.csv', 'w') as f:
-        f.write(str(nb_initial_subgraphs) + "," + str(nb_pruned_subgraphs) + "," + str(removed_duplicates))
+        f.write(dataset + "," + str(nb_initial_subgraphs) + "," + str(nb_pruned_subgraphs) + "," + str(removed_duplicates))
 
 if __name__ == "__main__":
-    if len(sys.argv) < 3:
-        print("Two arguments expected: path to input graph database, path to output results. Run as python remove_duplicates.py [imput_folder] [output_folder]")
+    if len(sys.argv) < 4:
+        print("Three arguments expected: path to input graph database, path to output results, and dataset name. Run as python remove_duplicates.py [input_folder] [output_folder] [dataset_name]")
 
     # Create output folder if it doesn't exist yet
     os.makedirs(sys.argv[2], exist_ok=True)
 
-    main(sys.argv[1], sys.argv[2])
+    main(sys.argv[1], sys.argv[2], sys.argv[3])
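
Because each results.csv row now starts with the dataset name, the per-dataset files can be collected into one overview without extra bookkeeping. A possible aggregation sketch (the glob pattern mirrors the output layout produced by pipeline.sh and is an assumption, not part of the commit):

# Possible way to collect the per-dataset results.csv files written by remove_duplicates.py.
import csv
import glob

rows = []
for path in glob.glob("output/*/*/mining_no_duplicates/results.csv"):
    with open(path) as f:
        rows.extend(csv.reader(f))

# column order follows main() above: dataset, nb_initial_subgraphs, nb_pruned_subgraphs, removed_duplicates
for dataset, nb_initial, nb_pruned, nb_removed in rows:
    print(dataset, nb_initial, nb_pruned, nb_removed)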

mining/run_parsemis.py

Lines changed: 6 additions & 3 deletions
@@ -10,22 +10,23 @@
 
 #### helper to compute memory for mining from machine specs ####
 def get_available_memory(fraction=0.8):
-
     with open('/proc/meminfo') as f:
         meminfo = f.read()
     matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)
     if matched:
         mem_total_gig = int(matched.groups()[0])/(1024*1024)
 
     available_mem = math.ceil(mem_total_gig * fraction)
+    return available_mem
 
 ########################### Parsemis #####################################################
 def run_parsemis(lib_path, in_folder, output_folder, threshold, min_size, max_size, timeout_seconds=180):
     memory = str(get_available_memory()) + 'G'
     nb_threads = os.cpu_count()
     # Template for shell command shell command
-    parsemis_cmd_template = "java -Xmx{memory} -jar '{parsemis_path}' --graphFile='{in_file}' --outputFile='{out_file}' --minimumFrequency={threshold} --maximumNodeCount={max_size} --minimumNodeCount={min_size} --algorithm=dagma --singleRooted=true --closeGraph=true --zaretsky=true --subdue=true --storeEmbeddings=true --distribution=threads --threads={nb_threads} --swapFile={swap_file}"
+    parsemis_cmd_template = "java -Xmx{memory} -jar '{parsemis_path}' --graphFile='{in_file}' --outputFile='{out_file}' --minimumFrequency={threshold} --maximumNodeCount={max_size} --minimumNodeCount={min_size} --algorithm=dagma --singleRooted=true --closeGraph=true --zaretsky=true --subdue=true --storeEmbeddings=true --distribution=threads --threads={nb_threads} --swapFile='{swap_file}'"
 
+    print(f'Running Frequent Subgraph Mining for input folder: {in_folder}')
     # Currently only support for line graph
     for idx, in_file in enumerate([file_name for file_name in os.listdir(in_folder) if file_name.endswith('.lg')]):
         match_id = re.match(regex_file_id, in_file)
@@ -44,6 +45,8 @@ def run_parsemis(lib_path, in_folder, output_folder, threshold, min_size, max_si
         parsemis_cmd = parsemis_cmd_template.format(parsemis_path=lib_path, in_file=in_folder + in_file, out_file=output_folder+'/results_' + str(database_id) + '.lg', swap_file=output_folder+'\swap.tmp', threshold=threshold, max_size=max_size, min_size=min_size, memory=memory, nb_threads=nb_threads )
         # Run command (REQUIRES JAVA 8!!!)
 
+        print(f'Running Frequent Subgraph Mining. Command: {parsemis_cmd}')
+
         try:
             p = subprocess.Popen(parsemis_cmd, shell=True, start_new_session=True)
             error_code = p.wait(timeout=timeout_seconds)
@@ -71,5 +74,5 @@ def run_parsemis(lib_path, in_folder, output_folder, threshold, min_size, max_si
     # Create output folder if it doesn't exist yet
     os.makedirs(sys.argv[3], exist_ok=True)
 
-
+
     run_parsemis(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6])
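
Besides the extra logging and the quoted swap file, the functional fix here is that get_available_memory() now returns the value it computes; previously the helper fell through without a return, so the caller presumably ended up with the string 'NoneG' for the -Xmx flag. A standalone sketch of the fixed helper (the fraction default and the /proc/meminfo parsing follow the code above; the fallback value is illustrative, not in the original):

# Sketch of the fixed helper: parse MemTotal from /proc/meminfo and return ceil(fraction * total) in GiB.
import math
import re

def get_available_memory(fraction=0.8):
    with open('/proc/meminfo') as f:
        meminfo = f.read()
    matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)  # MemTotal is reported in kB
    if not matched:
        return 1  # illustrative fallback, not part of the original helper
    mem_total_gig = int(matched.group(1)) / (1024 * 1024)
    return math.ceil(mem_total_gig * fraction)

print(str(get_available_memory()) + 'G')  # value handed to java -Xmx, e.g. '26G'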

mining/subgraph_statistics.py

Lines changed: 1 addition & 0 deletions
@@ -251,6 +251,7 @@ def compute_occurrences_lattice_based(self):
         # Store number of occurrences here
         for lattice_node in lattice_nodes:
             self.occurrences_transaction[lattice_node.graph.name] = len(lattice_node.occurrences)
+            print(lattice_node.occurrences)
 
     def write_as_csv(self, save_path):
         with open(save_path, 'w', newline='') as csvfile:
