
Commit c8564e8

Extending Variability Pattern Mining Scripts.
1 parent 31313cb commit c8564e8

5 files changed

Lines changed: 89 additions & 43 deletions


mining/compute_components.py

Lines changed: 22 additions & 20 deletions
@@ -36,30 +36,30 @@ def get_graph_components(graphs, filtered=False, filter_config=None):
 
         nb_of_components_per_graph.append(len(new_components))
         components += new_components
-    filtered_total = 0
     if filtered:
-        components, filtered_total = filter_too_large(*filter_too_many_similar_nodes(components, 0, filter_config.filter_too_many_similar_max_similar, filter_config.filter_too_many_similar_max_nodes), filter_config.filter_too_large_nb_nodes, filter_config.filter_too_large_nb_edges)
 
-    print("We have %d connected components in %d graphs. From these components %d have beend filtered." % (len(components) + filtered_total, len(graphs), filtered_total))
+        components, filtered = filter_too_large(*filter_too_many_similar_nodes(components, {}, filter_config.filter_too_many_similar_max_similar, filter_config.filter_too_many_similar_max_nodes), filter_config.filter_too_large_nb_nodes, filter_config.filter_too_large_nb_edges)
 
-    return components, nb_of_components_per_graph, filtered_total
+    #print("We have %d connected components in %d graphs. From these components %d have been filtered." % (len(components) + sum(filtered.values()), len(graphs), sum(filtered.values())))
+
+    return components, nb_of_components_per_graph, filtered
 
 # Filters components with more than nb_nodes/nb_edges nodes/edges. Use -1 for infinity.
-def filter_too_large(components: list, filtered_total: int, nb_nodes=18, nb_edges=40):
+def filter_too_large(components: list, filtered: dict, nb_nodes=18, nb_edges=40):
     new_components = []
 
     for component in components:
         if not (nb_nodes != -1 and (component.number_of_nodes() > nb_nodes or component.number_of_edges() > nb_edges)):
             new_components.append(component)
 
 
-    filtered_total += len(components)-len(new_components)
-    print("Filtered out %d components that are too large, i.e., more than %d nodes or %d edges" % (filtered_total, nb_nodes, nb_edges))
-    return new_components, filtered_total
+    filtered["too_large"] = len(components)-len(new_components)
+    #print("Filtered out %d components that are too large, i.e., more than %d nodes or %d edges" % (filtered["too_large"], nb_nodes, nb_edges))
+    return new_components, filtered
 
 
 # Several filters need to be applied to filter out components which could lead to too high computational efforts
-def filter_too_many_similar_nodes(components: list, filtered_total: int, max_similar=2, max_nodes=10):
+def filter_too_many_similar_nodes(components: list, filtered: dict, max_similar=2, max_nodes=10):
     new_components = []
 
     for component in components:
@@ -68,9 +68,9 @@ def filter_too_many_similar_nodes(components: list, filtered_total: int, max_sim
         if not (np.sum(np.array(list(labels.values())) > max_nodes) > max_similar):
             new_components.append(component)
 
-    filtered_total += len(components)-len(new_components)
-    print("Filtered out %d components with too many similar nodes, i.e., more than %d labels appeared more than %d times" % (filtered_total, max_similar, max_nodes))
-    return new_components, filtered_total
+    filtered["too_many_similar"] = len(components)-len(new_components)
+    #print("Filtered out %d components with too many similar nodes, i.e., more than %d labels appeared more than %d times" % (filtered["too_many_similar"], max_similar, max_nodes))
+    return new_components, filtered
 
 
 def label_count_for_component(component):
@@ -97,9 +97,9 @@ def get_components(input_folder, formatting=INPUT_FORMAT_NX, filtered=False):
     return components, nb_of_components_per_diff, filtered_total
 
 
-def main(input_folder, output_folder, formatting=INPUT_FORMAT_NX, max_components_per_file=200):
+def main(input_folder, output_folder, dataset_name, max_components_per_file=200, formatting=INPUT_FORMAT_NX):
     # TODO doing this with streams and yield would be a nicer solution to the chunking.
-    components, nb_of_components_per_diff, filtered_total = get_components(input_folder, formatting=formatting, filtered=True)
+    components, nb_of_components_per_diff, filtered = get_components(input_folder, formatting=formatting, filtered=True)
 
     components_batched = [components[i*max_components_per_file:min((i+1)*max_components_per_file, len(components))] for i in range(ceil(len(components)/max_components_per_file))]
 
@@ -117,12 +117,14 @@ def main(input_folder, output_folder, formatting=INPUT_FORMAT_NX, max_components
         #export_subdue_python_json(batch, set_name + '/connected_components.json')
 
     with open(output_folder + '/filter_stats.csv', 'w') as f:
-        f.write(str(len(components)) + "," + str(filtered_total))
+        # ds name, all components, components after filtering, filtered too large, filtered too many similar
+
+        f.write(dataset_name + "," + str(len(components) + filtered["too_large"] + filtered["too_many_similar"]) + "," + str(len(components)) + "," + str(filtered["too_large"]) + "," + str(filtered["too_many_similar"]))
 
 if __name__ == "__main__":
-    if len(sys.argv) == 3:
-        main(sys.argv[1], sys.argv[2])
-    elif len(sys.argv) == 4:
-        main(sys.argv[1], sys.argv[2], formatting = sys.argv[3])
+    if len(sys.argv) == 5:
+        main(sys.argv[1], sys.argv[2], sys.argv[3], int(sys.argv[4]))
+    elif len(sys.argv) == 6:
+        main(sys.argv[1], sys.argv[2], sys.argv[3], int(sys.argv[4]), formatting = sys.argv[5])
     else:
-        print("Unexpected number of arguments. At least input path and output path has to be provided. Optionally as a third argument put NX or LG indicating the input graph formatting.")
+        print("Unexpected number of arguments. At least input path, output path, dataset name, and batch size have to be provided. Optionally, as a fifth argument, put NX or LG indicating the input graph formatting.")
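
Note on the change above: the two filters no longer increment a single filtered_total but record their counts under separate keys of a shared dict, which main() then writes to filter_stats.csv together with the dataset name. A minimal, self-contained sketch of that bookkeeping (networkx graphs stand in for the real diff-graph components; the dataset name and the simplified similar-nodes check are illustrative only, not part of the commit):

# Illustrative sketch of the filtered-dict bookkeeping introduced above.
import networkx as nx

def filter_too_many_similar_nodes(components, filtered, max_similar=2, max_nodes=10):
    kept = list(components)  # the label-frequency check is omitted in this sketch
    filtered["too_many_similar"] = len(components) - len(kept)
    return kept, filtered

def filter_too_large(components, filtered, nb_nodes=18, nb_edges=40):
    kept = [c for c in components
            if nb_nodes == -1 or (c.number_of_nodes() <= nb_nodes and c.number_of_edges() <= nb_edges)]
    filtered["too_large"] = len(components) - len(kept)  # one count per filter instead of a running total
    return kept, filtered

components = [nx.path_graph(3), nx.complete_graph(25)]  # toy stand-ins for diff-graph components
components, filtered = filter_too_large(*filter_too_many_similar_nodes(components, {}))

# filter_stats.csv row written by main(): dataset name, components before filtering,
# components after filtering, filtered as too large, filtered for too many similar nodes
row = ",".join([
    "dataset_1",
    str(len(components) + filtered["too_large"] + filtered["too_many_similar"]),
    str(len(components)),
    str(filtered["too_large"]),
    str(filtered["too_many_similar"]),
])
print(row)  # -> dataset_1,2,1,1,0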

mining/pipeline.sh

File mode changed: 100644 → 100755
Lines changed: 55 additions & 15 deletions
@@ -1,27 +1,67 @@
 #!/bin/bash
 
+# call as pipeline.sh [input-path] [output-path] [lib-path]
+# e.g. ./pipeline.sh '/home/schwinnez/projects/Frequent Subgraph Mining/case studies/variability_patterns/dataset_1/diffgraphs/' '/home/schwinnez/projects/Frequent Subgraph Mining/case studies/variability_patterns/dataset_1/output/' './lib/'
+
+
+
+
+
+python_version=$(python3 --version)
+echo "Using python version $python_version"
+# TODO automatically make sure that python 3 is used
 
-# Definition of output directories
-output_filtered="$2filtered/"
-output_mining="$2mining/"
-output_mining_no_duplicates="$2mining_no_duplicates/"
-lib_path="$3"
-parsemis_path="$3parsemis.jar"
 target_subtree_count_for_threshold_estimation=300
 threshold=10 # 0 means read threshold from files
+batch_size=200
 min_size=4
 max_size=15
 
-mkdir $2
 
-# Step 1 - Read graph databases, filter and chunk - Default filter config: Not larger than 15 nodes, 30 edges, no more than
-python compute_components.py $1 $output_filtered LG
+echo "Input: $1"
+echo "Output: $2"
+echo "Libs: $3"
+
+mkdir -p "$2"
+
+
+run_dataset(){
+    echo "Input: $1"
+    echo "Output: $2"
+    echo "Libs: $3"
+
+    # Definition of output directories
+    data_set_name="$4"
+    input_path="$1"
+    output_filtered="$2filtered/"
+    output_mining="$2mining/"
+    output_mining_no_duplicates="$2mining_no_duplicates/"
+    lib_path="$3"
+    parsemis_path="$3parsemis.jar"
+
+    echo "Running dataset: $data_set_name"
+    mkdir -p "$2"
+
+
+    # Step 1 - Read graph databases, filter and chunk - Default filter config: Not larger than 15 nodes, 30 edges, no more than
+    python3 compute_components.py "$input_path" "$output_filtered" "$data_set_name" $batch_size LG
+
+    # Step 2 - Compute thresholds - Not better than fixed threshold for Linux dataset
+    #python bisect_threshold_search.py $lib_path $output_filtered $target_subtree_count_for_threshold_estimation
+
+    # Step 3 - Mining
+    python3 run_parsemis.py "$parsemis_path" "$output_filtered" "$output_mining" $threshold $min_size $max_size
+
+    # Step 4 - Remove duplicates
+    python3 remove_duplicates.py "$output_mining" "$output_mining_no_duplicates" "$data_set_name"
+}
+
+# Run for every dataset in input folder
+for input_folder in "$1"/*/ ; do
+    dataset="$(basename "$input_folder")"
+    output_base="$2/$batch_size-$threshold/$dataset/"
+    run_dataset "$input_folder" "$output_base" "$3" "$dataset"
+done
 
-# Step 2 - Compute thresholds - Not better than fixed threshold for Linux dataset
-#python bisect_threshold_search.py $lib_path $output_filtered $target_subtree_count_for_threshold_estimation
 
-# Step 3 - Mining
-python run_parsemis.py $parsemis_path $output_filtered $output_mining $threshold $min_size $max_size
 
-# Step 4 - Remove duplicates
-python remove_duplicates.py output_mining output_mining_no_duplicates
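
The script now wraps the four steps in a run_dataset function and calls it once per sub-folder of the input path, nesting each dataset's output under a <batch_size>-<threshold> directory. A small Python sketch of that path convention (the concrete root paths are assumptions, not part of the commit):

# Illustrative sketch of the folder convention used by the new per-dataset loop.
from pathlib import Path

input_root = Path("/data/diffgraphs")   # assumed layout: one sub-folder per dataset
output_root = Path("/data/output")
batch_size, threshold = 200, 10         # same defaults as at the top of pipeline.sh

for dataset_dir in sorted(p for p in input_root.iterdir() if p.is_dir()):
    dataset = dataset_dir.name
    output_base = output_root / f"{batch_size}-{threshold}" / dataset
    # run_dataset then creates filtered/, mining/ and mining_no_duplicates/ below output_base
    print(dataset_dir, "->", output_base)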

mining/remove_duplicates.py

Lines changed: 5 additions & 5 deletions
@@ -3,7 +3,7 @@
 import sys
 import os
 
-def main(subgraphs_path, results_dir):
+def main(subgraphs_path, results_dir, dataset):
     # Import graphs
     print("Parsing graph input files")
     subgraphs = import_tlv_folder(subgraphs_path, parse_support=False)
@@ -21,13 +21,13 @@ def main(subgraphs_path, results_dir):
     print("Writing subgraphs.")
     export_TLV(subgraphs, results_dir + './results_no_duplicates.lg')
     with open(results_dir + './results.csv', 'w') as f:
-        f.write(str(nb_initial_subgraphs) + "," + str(nb_pruned_subgraphs) + "," + str(removed_duplicates))
+        f.write(dataset + "," + str(nb_initial_subgraphs) + "," + str(nb_pruned_subgraphs) + "," + str(removed_duplicates))
 
 if __name__ == "__main__":
-    if len(sys.argv) < 3:
-        print("Two arguments expected: path to input graph database, path to output results. Run as python remove_duplicates.py [imput_folder] [output_folder]")
+    if len(sys.argv) < 4:
+        print("Three arguments expected: path to input graph database, path to output results, and dataset name. Run as python remove_duplicates.py [input_folder] [output_folder] [dataset_name]")
 
     # Create output folder if it doesn't exist yet
     os.makedirs(sys.argv[2], exist_ok=True)
 
-    main(sys.argv[1], sys.argv[2])
+    main(sys.argv[1], sys.argv[2], sys.argv[3])
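
Because each results.csv row now starts with the dataset name, the per-dataset files can be collected into one overview without extra bookkeeping. A possible aggregation sketch (the glob pattern mirrors the output layout produced by pipeline.sh and is an assumption, not part of the commit):

# Possible way to collect the per-dataset results.csv files written by remove_duplicates.py.
import csv
import glob

rows = []
for path in glob.glob("output/*/*/mining_no_duplicates/results.csv"):
    with open(path) as f:
        rows.extend(csv.reader(f))

# column order follows main() above: dataset, nb_initial_subgraphs, nb_pruned_subgraphs, removed_duplicates
for dataset, nb_initial, nb_pruned, nb_removed in rows:
    print(dataset, nb_initial, nb_pruned, nb_removed)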

mining/run_parsemis.py

Lines changed: 6 additions & 3 deletions
@@ -10,22 +10,23 @@
 
 #### helper to compute memory for mining from machine specs ####
 def get_available_memory(fraction=0.8):
-
     with open('/proc/meminfo') as f:
         meminfo = f.read()
     matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)
     if matched:
         mem_total_gig = int(matched.groups()[0])/(1024*1024)
 
     available_mem = math.ceil(mem_total_gig * fraction)
+    return available_mem
 
 ########################### Parsemis #####################################################
 def run_parsemis(lib_path, in_folder, output_folder, threshold, min_size, max_size, timeout_seconds=180):
     memory = str(get_available_memory()) + 'G'
     nb_threads = os.cpu_count()
     # Template for shell command shell command
-    parsemis_cmd_template = "java -Xmx{memory} -jar '{parsemis_path}' --graphFile='{in_file}' --outputFile='{out_file}' --minimumFrequency={threshold} --maximumNodeCount={max_size} --minimumNodeCount={min_size} --algorithm=dagma --singleRooted=true --closeGraph=true --zaretsky=true --subdue=true --storeEmbeddings=true --distribution=threads --threads={nb_threads} --swapFile={swap_file}"
+    parsemis_cmd_template = "java -Xmx{memory} -jar '{parsemis_path}' --graphFile='{in_file}' --outputFile='{out_file}' --minimumFrequency={threshold} --maximumNodeCount={max_size} --minimumNodeCount={min_size} --algorithm=dagma --singleRooted=true --closeGraph=true --zaretsky=true --subdue=true --storeEmbeddings=true --distribution=threads --threads={nb_threads} --swapFile='{swap_file}'"
 
+    print(f'Running Frequent Subgraph Mining for input folder: {in_folder}')
     # Currently only support for line graph
     for idx, in_file in enumerate([file_name for file_name in os.listdir(in_folder) if file_name.endswith('.lg')]):
         match_id = re.match(regex_file_id, in_file)
@@ -44,6 +45,8 @@ def run_parsemis(lib_path, in_folder, output_folder, threshold, min_size, max_si
         parsemis_cmd = parsemis_cmd_template.format(parsemis_path=lib_path, in_file=in_folder + in_file, out_file=output_folder+'/results_' + str(database_id) + '.lg', swap_file=output_folder+'\swap.tmp', threshold=threshold, max_size=max_size, min_size=min_size, memory=memory, nb_threads=nb_threads )
         # Run command (REQUIRES JAVA 8!!!)
 
+        print(f'Running Frequent Subgraph Mining. Command: {parsemis_cmd}')
+
         try:
             p = subprocess.Popen(parsemis_cmd, shell=True, start_new_session=True)
             error_code = p.wait(timeout=timeout_seconds)
@@ -71,5 +74,5 @@ def run_parsemis(lib_path, in_folder, output_folder, threshold, min_size, max_si
     # Create output folder if it doesn't exist yet
     os.makedirs(sys.argv[3], exist_ok=True)
 
-
+
     run_parsemis(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6])
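
Besides the extra logging and the quoted swap file, the functional fix here is that get_available_memory() now returns the value it computes; previously the helper fell through without a return, so the caller presumably ended up with the string 'NoneG' for the -Xmx flag. A standalone sketch of the fixed helper (the fraction default and the /proc/meminfo parsing follow the code above; the fallback value is illustrative, not in the original):

# Sketch of the fixed helper: parse MemTotal from /proc/meminfo and return ceil(fraction * total) in GiB.
import math
import re

def get_available_memory(fraction=0.8):
    with open('/proc/meminfo') as f:
        meminfo = f.read()
    matched = re.search(r'^MemTotal:\s+(\d+)', meminfo)  # MemTotal is reported in kB
    if not matched:
        return 1  # illustrative fallback, not part of the original helper
    mem_total_gig = int(matched.group(1)) / (1024 * 1024)
    return math.ceil(mem_total_gig * fraction)

print(str(get_available_memory()) + 'G')  # value handed to java -Xmx, e.g. '26G'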

mining/subgraph_statistics.py

Lines changed: 1 addition & 0 deletions
@@ -251,6 +251,7 @@ def compute_occurrences_lattice_based(self):
         # Store number of occurrences here
         for lattice_node in lattice_nodes:
             self.occurrences_transaction[lattice_node.graph.name] = len(lattice_node.occurrences)
+            print(lattice_node.occurrences)
 
     def write_as_csv(self, save_path):
         with open(save_path, 'w', newline='') as csvfile:
