Skip to content

Commit e11f6fa

Browse files
author
Christof Tinnes
committed
Improving the composite pattern mining pipeline.
1 parent 3aa7ac4 commit e11f6fa

30 files changed

Lines changed: 131066 additions & 25 deletions

mining/compute_components.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from networkx.readwrite import json_graph
77
import numpy as np
88
import re
9+
import time
910
from parse_utils import export_TLV, export_aids, export_subdue_c_graph, export_subdue_python_json, load_components_networkx, convert_node_link_graph_to_parsemis_directed_graph, convert_node_link_graph_to_subdue_c_graph, convert_node_link_graph_to_subdue_python_graph, import_tlv_folder, export_node_link_graph_from_subdue_c_graph, convert_node_link_graph_to_nx_graph
1011

1112
INPUT_FORMAT_NX = "NX"
@@ -94,8 +95,29 @@ def get_components(input_folder, formatting=INPUT_FORMAT_NX, filtered=False):
9495
components, nb_of_components_per_diff, filtered_total = get_graph_components(graphs, filtered=filtered, filter_config = FilterConfig())
9596

9697

98+
99+
do_statistics(components)
97100
return components, nb_of_components_per_diff, filtered_total
98101

102+
def do_statistics(components):
103+
have_specialization = [component for component in components if has_node(component, 'c4')]
104+
have_generalization = [component for component in components if has_node(component, 'c5')]
105+
have_refactoring = [component for component in components if has_node(component, 'c7')]
106+
have_reconfiguration = [component for component in components if has_node(component, 'c6')]
107+
have_s_g = [component for component in have_specialization if component in have_generalization]
108+
have_s_ref = [component for component in have_specialization if component in have_refactoring]
109+
have_s_rec = [component for component in have_specialization if component in have_reconfiguration]
110+
have_g_ref = [component for component in have_generalization if component in have_refactoring]
111+
have_g_rec = [component for component in have_generalization if component in have_reconfiguration]
112+
have_ref_rec = [component for component in have_refactoring if component in have_reconfiguration]
113+
total = len(components)
114+
print(f"total: {len(components)}; s: {len(have_specialization)/total}; g: {len(have_generalization)/total}; ref: {len(have_refactoring)/total}; rec: {len(have_reconfiguration)/total};")
115+
print(f"s_g: {len(have_s_g) / total}; s_ref: {len(have_s_ref) / total}; s_rec: {len(have_s_rec) / total}; g_ref: {len(have_g_ref) / total}; g_rec: {len(have_g_rec) / total}; ref_rec: {len(have_ref_rec) / total}")
116+
print(f"s_g: {len(have_specialization) * len(have_generalization) / (total**2)}; s_ref: {len(have_specialization) * len(have_refactoring) / (total**2)}; s_rec: {len(have_specialization) * len(have_reconfiguration) / (total**2)}; g_ref: {len(have_generalization) * len(have_refactoring) / (total ** 2)}; g_rec: {len(have_generalization) * len(have_reconfiguration) / (total**2)} ref_rec: {len(have_refactoring) * len(have_reconfiguration) / (total**2)};")
117+
118+
def has_node(graph, label):
    """Return True iff some node of *graph* has node attribute 'label' == label.

    Uses a lazily evaluated ``any()`` instead of materializing the full
    label list, and tolerates nodes without a 'label' attribute (they
    simply never match) instead of raising KeyError.

    Args:
        graph: a graph exposing ``nodes(data=True)`` (networkx-style).
        label: the label value to look for.
    """
    return any(data.get('label') == label for _, data in graph.nodes(data=True))
99121

100122
def main(input_folder, output_folder, dataset_name, max_components_per_file=200, formatting=INPUT_FORMAT_NX):
101123
# TODO doing this with streams and yield would be a nicer solution to the chunking.
@@ -122,9 +144,13 @@ def main(input_folder, output_folder, dataset_name, max_components_per_file=200,
122144
f.write(dataset_name + "," + str(len(components) + filtered["too_large"] + filtered["too_many_similar"]) + "," + str(len(components)) + "," + str(filtered["too_large"])+ "," + str(filtered["too_many_similar"]))
123145

124146
if __name__ == "__main__":
    # Local import keeps this CLI-block patch self-contained; harmless if
    # the module already imports os at the top.
    import os

    start_time = time.time()
    if len(sys.argv) == 5:
        main(sys.argv[1], sys.argv[2], sys.argv[3], int(sys.argv[4]))
    elif len(sys.argv) == 6:
        main(sys.argv[1], sys.argv[2], sys.argv[3], int(sys.argv[4]), formatting=sys.argv[5])
    else:
        print("Unexpected number of arguments. At least input path, output path, dataset name, as well as batch_size has to be provided. Optionally as a fourth argument put NX or LG indicating the input graph formatting.")
        # Exit here: no work was done, and sys.argv[2] may not even exist,
        # so writing a timing file below would crash or mislead.
        sys.exit(1)
    end_time = time.time()
    # os.path.join works with or without a trailing slash on the output path;
    # plain concatenation produced e.g. "outtime.txt" without one.
    with open(os.path.join(sys.argv[2], 'time.txt'), 'w') as f:
        f.write(str(end_time - start_time))

mining/extract_subgraphs.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Collect the per-dataset mining results (results_no_duplicates.lg) from
# <input_base>/<dataset>/mining_no_duplicates/ into a temp folder, then merge
# them into a single duplicate-free set with one file per subgraph.
#
# Usage: extract_subgraphs.sh <input_base> <output_base>

input_base=$1
output_base=$2

# Fail early on missing arguments instead of creating folders at "/temp/".
if [ -z "$input_base" ] || [ -z "$output_base" ]; then
    echo "Usage: $0 <input_base> <output_base>" >&2
    exit 1
fi

mkdir -p "$output_base"
mkdir -p "$output_base/temp/"

# Run for every dataset in input folder
for dataset in "$input_base"/*/ ; do
    dataset_name="$(basename "$dataset")"
    subgraphs_file="$dataset/mining_no_duplicates/results_no_duplicates.lg"
    # Quoted test: the unquoted form broke on paths containing whitespace.
    if [ -e "$subgraphs_file" ]; then
        cp "$subgraphs_file" "$output_base/temp/$dataset_name.lg"
    fi
done

# Remove duplicates and output every subgraph as single file
python3 remove_duplicates.py "$output_base/temp/" "$output_base" "all subgraphs" "True"
rm -rf "$output_base/temp/"

mining/pipeline.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@ echo "Using python version $python_version"
1313

1414
target_subtree_count_for_threshold_estimation=300
1515
threshold=10 # 0 means read threshold from files
16-
batch_size=200
17-
min_size=4
16+
batch_size=1000
17+
timeout_mining=120
18+
min_size=5
1819
max_size=15
1920

2021

@@ -50,7 +51,7 @@ run_dataset(){
5051
#python bisect_threshold_search.py $lib_path $output_filtered $target_subtree_count_for_threshold_estimation
5152

5253
# Step 3 - Mining
53-
python3 run_parsemis.py "$parsemis_path" "$output_filtered" "$output_mining" $threshold $min_size $max_size
54+
python3 run_parsemis.py "$parsemis_path" "$output_filtered" "$output_mining" $threshold $min_size $max_size $timeout_mining
5455

5556
# Step 4 - Remove duplicates
5657
python3 remove_duplicates.py "$output_mining" "$output_mining_no_duplicates" "$data_set_name"

mining/project_list.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
Project name | Domain | Source code available (**y**es/**n**o)? | Is it a git repository (**y**es/**n**o)? | Repository URL | Clone URL | Estimated number of commits
2+
---|-------------------------|-----------------------------------------|-----------------------------------|--------------------------------------------------------------|------------------------------------------------------------------|---
3+
apache-httpd | web server | y | y | https://github.com/apache/httpd | https://github.com/apache/httpd.git | 32,927
4+
berkeley-db-libdb | database system | y | y | https://github.com/berkeleydb/libdb | https://github.com/berkeleydb/libdb.git | 7
5+
busybox | embedded systems | y | y | https://git.busybox.net/busybox | https://git.busybox.net/busybox | 17,447
6+
cherokee-webserver | web server | y | y | https://github.com/cherokee/webserver | https://github.com/cherokee/webserver.git | 5,853
7+
clamav | antivirus program | y | y | https://github.com/Cisco-Talos/clamav | https://github.com/Cisco-Talos/clamav.git | 10,656
8+
dia | diagramming software | y | y | https://github.com/GNOME/dia | https://github.com/GNOME/dia.git | 6,666
9+
emacs | text editor | y | y | https://github.com/emacs-mirror/emacs | https://github.com/emacs-mirror/emacs.git | 153,926
10+
freebsd | operating system | y | y | https://github.com/freebsd/freebsd-src | https://github.com/freebsd/freebsd-src.git | 271,937
11+
gcc | compiler framework | y | y | https://github.com/gcc-mirror/gcc | https://github.com/gcc-mirror/gcc.git | 191,255
12+
ghostscript | postscript interpreter | y | y | https://github.com/ArtifexSoftware/ghostpdl | https://github.com/ArtifexSoftware/ghostpdl.git | 22,137
13+
gimp | graphics editor | y | y | https://gitlab.gnome.org/GNOME/gimp | https://gitlab.gnome.org/GNOME/gimp.git | 47,782
14+
glibc | programming library | y | y | https://sourceware.org/git/?p=glibc.git | https://sourceware.org/git/glibc.git | 38,318
15+
gnumeric | spreadsheet application | y | y | https://gitlab.gnome.org/GNOME/gnumeric | https://gitlab.gnome.org/GNOME/gnumeric.git | 24,134
16+
gnuplot | plotting tool | y | y | https://github.com/gnuplot/gnuplot | https://github.com/gnuplot/gnuplot.git | 11,748
17+
Godot | game engine | y | y | https://github.com/godotengine/godot | https://github.com/godotengine/godot.git | 40,742
18+
irssi | IRC client | y | y | https://github.com/irssi/irssi | https://github.com/irssi/irssi.git | 6,346
19+
libssh | network | y | y | https://gitlab.com/libssh/libssh-mirror | https://gitlab.com/libssh/libssh-mirror.git | 5,349
20+
libxml2 | XML library | y | y | https://gitlab.gnome.org/GNOME/libxml2 | https://gitlab.gnome.org/GNOME/libxml2.git | 5,130
21+
lighttpd | web server | y | y | https://git.lighttpd.net/lighttpd/lighttpd1.4 | https://git.lighttpd.net/lighttpd/lighttpd1.4.git | 4,431
22+
linux | operating system | y | y | https://github.com/torvalds/linux | https://github.com/torvalds/linux.git | 1,072,142
23+
lynx | web browser | y | y | https://github.com/lynx/lynx | https://github.com/lynx/lynx.git | 125
24+
Marlin | 3d printing | y | y | https://github.com/MarlinFirmware/Marlin | https://github.com/MarlinFirmware/Marlin.git | 19,258
25+
minix | operating system | y | y | https://github.com/Stichting-MINIX-Research-Foundation/minix | https://github.com/Stichting-MINIX-Research-Foundation/minix.git | 7,153
26+
mplayer-svn | media player | y | y | https://github.com/pigoz/mplayer-svn | https://github.com/pigoz/mplayer-svn.git | 37,992
27+
MPSolve | mathematical software | y | y | https://github.com/robol/MPSolve | https://github.com/robol/MPSolve.git | 1,773
28+
openldap | LDAP directory service | y | y | https://github.com/openldap/openldap | https://github.com/openldap/openldap.git | 23,928
29+
opensolaris | operating system | y | y | https://github.com/kofemann/opensolaris | https://github.com/kofemann/opensolaris.git | 11,422
30+
openvpn | security application | y | y | https://github.com/OpenVPN/openvpn | https://github.com/OpenVPN/openvpn.git | 3,118
31+
parrot | virtual machine | y | y | https://github.com/parrot/parrot | https://github.com/parrot/parrot.git | 49,989
32+
php | program interpreter | y | y | https://github.com/php/php-src | https://github.com/php/php-src.git | 127,609
33+
Pidgin | instant messenger | y | y | https://github.com/Intika-Pidgin/Pidgin | https://github.com/Intika-Pidgin/Pidgin.git | 40,097
34+
postgresql | database system | y | y | https://github.com/postgres/postgres | https://github.com/postgres/postgres.git | 52,881
35+
privoxy | proxy server | y | y | https://www.privoxy.org/gitweb/?p=privoxy.git;a=summary | https://www.privoxy.org/git/privoxy.git | 7,558
36+
cpython | program interpreter | y | y | https://github.com/python/cpython | https://github.com/python/cpython.git | 112,096
37+
sendmail | mail transfer agent | y | y | https://github.com/guileen/node-sendmail | https://github.com/guileen/node-sendmail.git | 86
38+
sqlite | databases | y | y | https://github.com/smparkes/sqlite | https://github.com/smparkes/sqlite.git | 8,664
39+
subversion | revision control system | y | y | https://github.com/apache/subversion | https://github.com/apache/subversion.git | 60,030
40+
sylpheed | e-mail client | y | y | https://github.com/jan0sch/sylpheed | https://github.com/jan0sch/sylpheed.git | 2,682
41+
tcl | program interpreter | y | y | https://github.com/tcltk/tcl | https://github.com/tcltk/tcl.git | 24,396
42+
vim | text editor | y | y | https://github.com/vim/vim | https://github.com/vim/vim.git | 15,274
43+
xfig | vector graphics editor | y | y | https://github.com/hhoeflin/xfig | https://github.com/hhoeflin/xfig.git | 9
44+
xine-lib | media library | y | y | https://github.com/rpmfusion/xine-lib | https://github.com/rpmfusion/xine-lib.git | 114
45+
xorg-server | X server | y | y | https://gitlab.freedesktop.org/xorg/xserver | https://gitlab.freedesktop.org/xorg/xserver.git | 17,786
46+
xterm | terminal emulator | y | y | https://github.com/Maximus5/xterm | https://github.com/Maximus5/xterm.git | 112

mining/remove_duplicates.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
from parse_utils import import_tlv_folder, export_TLV
33
import sys
44
import os
5+
import time
56

6-
def main(subgraphs_path, results_dir, dataset):
7+
def main(subgraphs_path, results_dir, dataset, output_single=False):
78
# Import graphs
89
print("Parsing graph input files")
910
subgraphs = import_tlv_folder(subgraphs_path, parse_support=False)
@@ -16,18 +17,28 @@ def main(subgraphs_path, results_dir, dataset):
1617
print("Removed %d duplicates" % removed_duplicates)
1718

1819
# Load graphs
19-
20-
2120
print("Writing subgraphs.")
22-
export_TLV(subgraphs, results_dir + './results_no_duplicates.lg')
23-
with open(results_dir + './results.csv', 'w') as f:
21+
if output_single:
22+
for i in range(len(subgraphs)):
23+
export_TLV([subgraphs[i]], results_dir + '/' + str(i) + '.lg')
24+
else:
25+
export_TLV(subgraphs, results_dir + '/results_no_duplicates.lg')
26+
27+
with open(results_dir + '/results.csv', 'w') as f:
2428
f.write(dataset + "," + str(nb_initial_subgraphs) +"," + str(nb_pruned_subgraphs) + "," + str(removed_duplicates))
2529

2630
if __name__ == "__main__":
    if len(sys.argv) < 4 or len(sys.argv) > 5:
        print("Three or four arguments expected: path to input graph database, path to output results, and optionally if every subgraph should be written to a single file. Run as python remove_duplicates.py [input_folder] [output_folder] [dataset_name] [True/False]")
        # Previously only printed and fell through, then crashed below
        # on sys.argv[2] with too few arguments.
        sys.exit(1)

    # Create output folder if it doesn't exist yet
    os.makedirs(sys.argv[2], exist_ok=True)

    start_time = time.time()
    if len(sys.argv) == 4:
        main(sys.argv[1], sys.argv[2], sys.argv[3])
    else:
        # bool(sys.argv[4]) is True for ANY non-empty string, including
        # "False" -- parse the flag textually instead.
        output_single = sys.argv[4].strip().lower() in ("true", "1", "yes")
        main(sys.argv[1], sys.argv[2], sys.argv[3], output_single)
    end_time = time.time()
    # os.path.join tolerates a missing trailing slash on the output path.
    with open(os.path.join(sys.argv[2], 'time.txt'), 'w') as f:
        f.write(str(end_time - start_time))

mining/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
networkx
2+
numpy

mining/run_parsemis.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import re
66

77
import math
8+
import time
89

910
regex_file_id = r".*_(\d+).*"
1011

@@ -68,11 +69,14 @@ def run_parsemis(lib_path, in_folder, output_folder, threshold, min_size, max_si
6869

6970

7071
if __name__ == "__main__":
    if len(sys.argv) < 8:
        print("Call like python run_parsemis.py [lib_path] [in_folder] [out_folder] [threshold] [min_size] [max_size] [timeout_seconds]")
        # Previously only printed and fell through, then crashed below
        # with an IndexError on the missing arguments.
        sys.exit(1)

    # Create output folder if it doesn't exist yet
    os.makedirs(sys.argv[3], exist_ok=True)

    start_time = time.time()
    run_parsemis(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6], int(sys.argv[7]))
    end_time = time.time()
    # os.path.join tolerates a missing trailing slash on the output path;
    # plain concatenation produced e.g. "outtime.txt" without one.
    with open(os.path.join(sys.argv[3], 'time.txt'), 'w') as f:
        f.write(str(end_time - start_time))

0 commit comments

Comments
 (0)