Skip to content

Commit e11f6fa

Browse files
author
Christof Tinnes
committed
Improving the composite pattern mining pipeline.
1 parent 3aa7ac4 commit e11f6fa

30 files changed

Lines changed: 131066 additions & 25 deletions

mining/compute_components.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from networkx.readwrite import json_graph
77
import numpy as np
88
import re
9+
import time
910
from parse_utils import export_TLV, export_aids, export_subdue_c_graph, export_subdue_python_json, load_components_networkx, convert_node_link_graph_to_parsemis_directed_graph, convert_node_link_graph_to_subdue_c_graph, convert_node_link_graph_to_subdue_python_graph, import_tlv_folder, export_node_link_graph_from_subdue_c_graph, convert_node_link_graph_to_nx_graph
1011

1112
INPUT_FORMAT_NX = "NX"
@@ -94,8 +95,29 @@ def get_components(input_folder, formatting=INPUT_FORMAT_NX, filtered=False):
9495
components, nb_of_components_per_diff, filtered_total = get_graph_components(graphs, filtered=filtered, filter_config = FilterConfig())
9596

9697

98+
99+
do_statistics(components)
97100
return components, nb_of_components_per_diff, filtered_total
98101

102+
def do_statistics(components):
103+
have_specialization = [component for component in components if has_node(component, 'c4')]
104+
have_generalization = [component for component in components if has_node(component, 'c5')]
105+
have_refactoring = [component for component in components if has_node(component, 'c7')]
106+
have_reconfiguration = [component for component in components if has_node(component, 'c6')]
107+
have_s_g = [component for component in have_specialization if component in have_generalization]
108+
have_s_ref = [component for component in have_specialization if component in have_refactoring]
109+
have_s_rec = [component for component in have_specialization if component in have_reconfiguration]
110+
have_g_ref = [component for component in have_generalization if component in have_refactoring]
111+
have_g_rec = [component for component in have_generalization if component in have_reconfiguration]
112+
have_ref_rec = [component for component in have_refactoring if component in have_reconfiguration]
113+
total = len(components)
114+
print(f"total: {len(components)}; s: {len(have_specialization)/total}; g: {len(have_generalization)/total}; ref: {len(have_refactoring)/total}; rec: {len(have_reconfiguration)/total};")
115+
print(f"s_g: {len(have_s_g) / total}; s_ref: {len(have_s_ref) / total}; s_rec: {len(have_s_rec) / total}; g_ref: {len(have_g_ref) / total}; g_rec: {len(have_g_rec) / total}; ref_rec: {len(have_ref_rec) / total}")
116+
print(f"s_g: {len(have_specialization) * len(have_generalization) / (total**2)}; s_ref: {len(have_specialization) * len(have_refactoring) / (total**2)}; s_rec: {len(have_specialization) * len(have_reconfiguration) / (total**2)}; g_ref: {len(have_generalization) * len(have_refactoring) / (total ** 2)}; g_rec: {len(have_generalization) * len(have_reconfiguration) / (total**2)} ref_rec: {len(have_refactoring) * len(have_reconfiguration) / (total**2)};")
117+
118+
def has_node(graph, label):
    """Return True iff some node of *graph* has node attribute 'label' == label.

    Uses a lazily evaluated ``any()`` instead of materializing the full
    label list, and tolerates nodes without a 'label' attribute (they
    simply never match) instead of raising KeyError.

    Args:
        graph: a graph exposing ``nodes(data=True)`` (networkx-style).
        label: the label value to look for.
    """
    return any(data.get('label') == label for _, data in graph.nodes(data=True))
99121

100122
def main(input_folder, output_folder, dataset_name, max_components_per_file=200, formatting=INPUT_FORMAT_NX):
101123
# TODO doing this with streams and yield would be a nicer solution to the chunking.
@@ -122,9 +144,13 @@ def main(input_folder, output_folder, dataset_name, max_components_per_file=200,
122144
f.write(dataset_name + "," + str(len(components) + filtered["too_large"] + filtered["too_many_similar"]) + "," + str(len(components)) + "," + str(filtered["too_large"])+ "," + str(filtered["too_many_similar"]))
123145

124146
if __name__ == "__main__":
    # Local import keeps this CLI-block patch self-contained; harmless if
    # the module already imports os at the top.
    import os

    start_time = time.time()
    if len(sys.argv) == 5:
        main(sys.argv[1], sys.argv[2], sys.argv[3], int(sys.argv[4]))
    elif len(sys.argv) == 6:
        main(sys.argv[1], sys.argv[2], sys.argv[3], int(sys.argv[4]), formatting=sys.argv[5])
    else:
        print("Unexpected number of arguments. At least input path, output path, dataset name, as well as batch_size has to be provided. Optionally as a fourth argument put NX or LG indicating the input graph formatting.")
        # Exit here: no work was done, and sys.argv[2] may not even exist,
        # so writing a timing file below would crash or mislead.
        sys.exit(1)
    end_time = time.time()
    # os.path.join works with or without a trailing slash on the output path;
    # plain concatenation produced e.g. "outtime.txt" without one.
    with open(os.path.join(sys.argv[2], 'time.txt'), 'w') as f:
        f.write(str(end_time - start_time))

mining/extract_subgraphs.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Collect the per-dataset mining results (results_no_duplicates.lg) from
# <input_base>/<dataset>/mining_no_duplicates/ into a temp folder, then merge
# them into a single duplicate-free set with one file per subgraph.
#
# Usage: extract_subgraphs.sh <input_base> <output_base>

input_base=$1
output_base=$2

# Fail early on missing arguments instead of creating folders at "/temp/".
if [ -z "$input_base" ] || [ -z "$output_base" ]; then
    echo "Usage: $0 <input_base> <output_base>" >&2
    exit 1
fi

mkdir -p "$output_base"
mkdir -p "$output_base/temp/"

# Run for every dataset in input folder
for dataset in "$input_base"/*/ ; do
    dataset_name="$(basename "$dataset")"
    subgraphs_file="$dataset/mining_no_duplicates/results_no_duplicates.lg"
    # Quoted test: the unquoted form broke on paths containing whitespace.
    if [ -e "$subgraphs_file" ]; then
        cp "$subgraphs_file" "$output_base/temp/$dataset_name.lg"
    fi
done

# Remove duplicates and output every subgraph as single file
python3 remove_duplicates.py "$output_base/temp/" "$output_base" "all subgraphs" "True"
rm -rf "$output_base/temp/"

mining/pipeline.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@ echo "Using python version $python_version"
1313

1414
target_subtree_count_for_threshold_estimation=300
1515
threshold=10 # 0 means read threshold from files
16-
batch_size=200
17-
min_size=4
16+
batch_size=1000
17+
timeout_mining=120
18+
min_size=5
1819
max_size=15
1920

2021

@@ -50,7 +51,7 @@ run_dataset(){
5051
#python bisect_threshold_search.py $lib_path $output_filtered $target_subtree_count_for_threshold_estimation
5152

5253
# Step 3 - Mining
53-
python3 run_parsemis.py "$parsemis_path" "$output_filtered" "$output_mining" $threshold $min_size $max_size
54+
python3 run_parsemis.py "$parsemis_path" "$output_filtered" "$output_mining" $threshold $min_size $max_size $timeout_mining
5455

5556
# Step 4 - Remove duplicates
5657
python3 remove_duplicates.py "$output_mining" "$output_mining_no_duplicates" "$data_set_name"

mining/project_list.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
Project name | Domain | Source code available (**y**es/**n**o)? | Is it a git repository (**y**es/**n**o)? | Repository URL | Clone URL | Estimated number of commits
2+
---|-------------------------|-----------------------------------------|-----------------------------------|--------------------------------------------------------------|------------------------------------------------------------------|---
3+
apache-httpd | web server | y | y | https://github.com/apache/httpd | https://github.com/apache/httpd.git | 32,927
4+
berkeley-db-libdb | database system | y | y | https://github.com/berkeleydb/libdb | https://github.com/berkeleydb/libdb.git | 7
5+
busybox | embedded systems | y | y | https://git.busybox.net/busybox | https://git.busybox.net/busybox | 17,447
6+
cherokee-webserver | web server | y | y | https://github.com/cherokee/webserver | https://github.com/cherokee/webserver.git | 5,853
7+
clamav | antivirus program | y | y | https://github.com/Cisco-Talos/clamav | https://github.com/Cisco-Talos/clamav.git | 10,656
8+
dia | diagramming software | y | y | https://github.com/GNOME/dia | https://github.com/GNOME/dia.git | 6,666
9+
emacs | text editor | y | y | https://github.com/emacs-mirror/emacs | https://github.com/emacs-mirror/emacs.git | 153,926
10+
freebsd | operating system | y | y | https://github.com/freebsd/freebsd-src | https://github.com/freebsd/freebsd-src.git | 271,937
11+
gcc | compiler framework | y | y | https://github.com/gcc-mirror/gcc | https://github.com/gcc-mirror/gcc.git | 191,255
12+
ghostscript | postscript interpreter | y | y | https://github.com/ArtifexSoftware/ghostpdl | https://github.com/ArtifexSoftware/ghostpdl.git | 22,137
13+
gimp | graphics editor | y | y | https://gitlab.gnome.org/GNOME/gimp | https://gitlab.gnome.org/GNOME/gimp.git | 47,782
14+
glibc | programming library | y | y | https://sourceware.org/git/?p=glibc.git | https://sourceware.org/git/glibc.git | 38,318
15+
gnumeric | spreadsheet application | y | y | https://gitlab.gnome.org/GNOME/gnumeric | https://gitlab.gnome.org/GNOME/gnumeric.git | 24,134
16+
gnuplot | plotting tool | y | y | https://github.com/gnuplot/gnuplot | https://github.com/gnuplot/gnuplot.git | 11,748
17+
Godot | game engine | y | y | https://github.com/godotengine/godot | https://github.com/godotengine/godot.git | 40,742
18+
irssi | IRC client | y | y | https://github.com/irssi/irssi | https://github.com/irssi/irssi.git | 6,346
19+
libssh | network | y | y | https://gitlab.com/libssh/libssh-mirror | https://gitlab.com/libssh/libssh-mirror.git | 5,349
20+
libxml2 | XML library | y | y | https://gitlab.gnome.org/GNOME/libxml2 | https://gitlab.gnome.org/GNOME/libxml2.git | 5,130
21+
lighttpd | web server | y | y | https://git.lighttpd.net/lighttpd/lighttpd1.4 | https://git.lighttpd.net/lighttpd/lighttpd1.4.git | 4,431
22+
linux | operating system | y | y | https://github.com/torvalds/linux | https://github.com/torvalds/linux.git | 1,072,142
23+
lynx | web browser | y | y | https://github.com/lynx/lynx | https://github.com/lynx/lynx.git | 125
24+
Marlin | 3d printing | y | y | https://github.com/MarlinFirmware/Marlin | https://github.com/MarlinFirmware/Marlin.git | 19,258
25+
minix | operating system | y | y | https://github.com/Stichting-MINIX-Research-Foundation/minix | https://github.com/Stichting-MINIX-Research-Foundation/minix.git | 7,153
26+
mplayer-svn | media player | y | y | https://github.com/pigoz/mplayer-svn | https://github.com/pigoz/mplayer-svn.git | 37,992
27+
MPSolve | mathematical software | y | y | https://github.com/robol/MPSolve | https://github.com/robol/MPSolve.git | 1,773
28+
openldap | LDAP directory service | y | y | https://github.com/openldap/openldap | https://github.com/openldap/openldap.git | 23,928
29+
opensolaris | operating system | y | y | https://github.com/kofemann/opensolaris | https://github.com/kofemann/opensolaris.git | 11,422
30+
openvpn | security application | y | y | https://github.com/OpenVPN/openvpn | https://github.com/OpenVPN/openvpn.git | 3,118
31+
parrot | virtual machine | y | y | https://github.com/parrot/parrot | https://github.com/parrot/parrot.git | 49,989
32+
php | program interpreter | y | y | https://github.com/php/php-src | https://github.com/php/php-src.git | 127,609
33+
Pidgin | instant messenger | y | y | https://github.com/Intika-Pidgin/Pidgin | https://github.com/Intika-Pidgin/Pidgin.git | 40,097
34+
postgresql | database system | y | y | https://github.com/postgres/postgres | https://github.com/postgres/postgres.git | 52,881
35+
privoxy | proxy server | y | y | https://www.privoxy.org/gitweb/?p=privoxy.git;a=summary | https://www.privoxy.org/git/privoxy.git | 7,558
36+
cpython | program interpreter | y | y | https://github.com/python/cpython | https://github.com/python/cpython.git | 112,096
37+
sendmail | mail transfer agent | y | y | https://github.com/guileen/node-sendmail | https://github.com/guileen/node-sendmail.git | 86
38+
sqlite | databases | y | y | https://github.com/smparkes/sqlite | https://github.com/smparkes/sqlite.git | 8,664
39+
subversion | revision control system | y | y | https://github.com/apache/subversion | https://github.com/apache/subversion.git | 60,030
40+
sylpheed | e-mail client | y | y | https://github.com/jan0sch/sylpheed | https://github.com/jan0sch/sylpheed.git | 2,682
41+
tcl | program interpreter | y | y | https://github.com/tcltk/tcl | https://github.com/tcltk/tcl.git | 24,396
42+
vim | text editor | y | y | https://github.com/vim/vim | https://github.com/vim/vim.git | 15,274
43+
xfig | vector graphics editor | y | y | https://github.com/hhoeflin/xfig | https://github.com/hhoeflin/xfig.git | 9
44+
xine-lib | media library | y | y | https://github.com/rpmfusion/xine-lib | https://github.com/rpmfusion/xine-lib.git | 114
45+
xorg-server | X server | y | y | https://gitlab.freedesktop.org/xorg/xserver | https://gitlab.freedesktop.org/xorg/xserver.git | 17,786
46+
xterm | terminal emulator | y | y | https://github.com/Maximus5/xterm | https://github.com/Maximus5/xterm.git | 112

mining/remove_duplicates.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
from parse_utils import import_tlv_folder, export_TLV
33
import sys
44
import os
5+
import time
56

6-
def main(subgraphs_path, results_dir, dataset):
7+
def main(subgraphs_path, results_dir, dataset, output_single=False):
78
# Import graphs
89
print("Parsing graph input files")
910
subgraphs = import_tlv_folder(subgraphs_path, parse_support=False)
@@ -16,18 +17,28 @@ def main(subgraphs_path, results_dir, dataset):
1617
print("Removed %d duplicates" % removed_duplicates)
1718

1819
# Load graphs
19-
20-
2120
print("Writing subgraphs.")
22-
export_TLV(subgraphs, results_dir + './results_no_duplicates.lg')
23-
with open(results_dir + './results.csv', 'w') as f:
21+
if output_single:
22+
for i in range(len(subgraphs)):
23+
export_TLV([subgraphs[i]], results_dir + '/' + str(i) + '.lg')
24+
else:
25+
export_TLV(subgraphs, results_dir + '/results_no_duplicates.lg')
26+
27+
with open(results_dir + '/results.csv', 'w') as f:
2428
f.write(dataset + "," + str(nb_initial_subgraphs) +"," + str(nb_pruned_subgraphs) + "," + str(removed_duplicates))
2529

2630
if __name__ == "__main__":
    if len(sys.argv) < 4 or len(sys.argv) > 5:
        print("Three or four arguments expected: path to input graph database, path to output results, and optionally if every subgraph should be written to a single file. Run as python remove_duplicates.py [input_folder] [output_folder] [dataset_name] [True/False]")
        # Previously only printed and fell through, then crashed below
        # on sys.argv[2] with too few arguments.
        sys.exit(1)

    # Create output folder if it doesn't exist yet
    os.makedirs(sys.argv[2], exist_ok=True)

    start_time = time.time()
    if len(sys.argv) == 4:
        main(sys.argv[1], sys.argv[2], sys.argv[3])
    else:
        # bool(sys.argv[4]) is True for ANY non-empty string, including
        # "False" -- parse the flag textually instead.
        output_single = sys.argv[4].strip().lower() in ("true", "1", "yes")
        main(sys.argv[1], sys.argv[2], sys.argv[3], output_single)
    end_time = time.time()
    # os.path.join tolerates a missing trailing slash on the output path.
    with open(os.path.join(sys.argv[2], 'time.txt'), 'w') as f:
        f.write(str(end_time - start_time))

mining/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
networkx
2+
numpy

mining/run_parsemis.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import re
66

77
import math
8+
import time
89

910
regex_file_id = r".*_(\d+).*"
1011

@@ -68,11 +69,14 @@ def run_parsemis(lib_path, in_folder, output_folder, threshold, min_size, max_si
6869

6970

7071
if __name__ == "__main__":
    if len(sys.argv) < 8:
        print("Call like python run_parsemis.py [lib_path] [in_folder] [out_folder] [threshold] [min_size] [max_size] [timeout_seconds]")
        # Previously only printed and fell through, then crashed below
        # with an IndexError on the missing arguments.
        sys.exit(1)

    # Create output folder if it doesn't exist yet
    os.makedirs(sys.argv[3], exist_ok=True)

    start_time = time.time()
    run_parsemis(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6], int(sys.argv[7]))
    end_time = time.time()
    # os.path.join tolerates a missing trailing slash on the output path;
    # plain concatenation produced e.g. "outtime.txt" without one.
    with open(os.path.join(sys.argv[3], 'time.txt'), 'w') as f:
        f.write(str(end_time - start_time))

0 commit comments

Comments
 (0)