Skip to content

Commit 432de6a

Browse files
authored
Merge pull request #36 from VariantSync/tier-bm
Update Main
2 parents 5119f4c + ad36532 commit 432de6a

46 files changed

Lines changed: 785 additions & 101 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

docs/datasets/emacs.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Project name | Domain | Source code available (**y**es/**n**o)? | Is it a git repository (**y**es/**n**o)? | Repository URL | Clone URL | Estimated number of commits
2+
---|-------------------------|-----------------------------------------|-----------------------------------|--------------------------------------------------------------|------------------------------------------------------------------|---
3+
emacs | text editor | y | y | https://github.com/emacs-mirror/emacs | https://github.com/emacs-mirror/emacs.git | 153,926

genUltimateResults.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
resultsdir=$1
2+
3+
java -cp "target/diffdetective-1.0.0-jar-with-dependencies.jar" org.variantsync.diffdetective.tablegen.MiningResultAccumulator $resultsdir $resultsdir
4+
echo "genUltimateResults.sh DONE"

mining/compute_components.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,10 @@ def has_node(graph, label):
122122
def main(input_folder, output_folder, dataset_name, max_components_per_file=200, formatting=INPUT_FORMAT_NX):
123123
# TODO doing this with streams and yield would be a nicer solution to the chunking.
124124
components, nb_of_components_per_diff, filtered = get_components(input_folder, formatting=formatting, filtered=True)
125-
125+
for component in components:
126+
if not nx.is_directed_acyclic_graph(component):
127+
print(f"WARN: THERE ARE NON DAG GRAPHS IN THE INPUT: {component.name}")
128+
126129
components_batched = [components[i*max_components_per_file:min((i+1)*max_components_per_file, len(components))] for i in range(ceil(len(components)/max_components_per_file))]
127130

128131
# Create output folder if it doesn't exist yet

mining/parse_utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ def import_tlv(path, parse_support=True):
251251

252252
# Some file formats give the support directly, others list all the embeddings. We support both options.
253253
regex_support = r"Support: (\d+).*"
254-
regex_embedding = r"#=> (\d+) .*"
254+
regex_embedding = r"#=> ([^\s]+) .*"
255255

256256
# if tlv header continue parsing
257257
match_header = re.match(regex_header, next_line)
@@ -281,11 +281,14 @@ def import_tlv(path, parse_support=True):
281281
elif match_support:
282282
support = int(match_support.group(1))
283283
elif match_embedding:
284-
support_set.add(int(match_embedding.group(1)))
284+
support_set.add(str(match_embedding.group(1)))
285285
next_line = graph_db.readline()
286286
if next_line:
287287
match_header = re.match(regex_header, next_line)
288288

289+
if support_set is not None:
290+
graph.graph['embeddings'] = str(support_set)
291+
289292
if (support is None and support_set == set() and parse_support):
290293
print("WARN: Error parsing line graph with graph support. Check format.")
291294
elif not parse_support:

mining/run_parsemis.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ def run_parsemis(lib_path, in_folder, output_folder, threshold, min_size, max_si
2525
memory = str(get_available_memory()) + 'G'
2626
nb_threads = os.cpu_count()
2727
# Template for shell command
28-
parsemis_cmd_template = "java -Xmx{memory} -jar '{parsemis_path}' --graphFile='{in_file}' --outputFile='{out_file}' --minimumFrequency={threshold} --maximumNodeCount={max_size} --minimumNodeCount={min_size} --algorithm=dagma --singleRooted=true --closeGraph=true --zaretsky=true --subdue=true --storeEmbeddings=true --distribution=threads --threads={nb_threads} --swapFile='{swap_file}'"
29-
28+
parsemis_cmd_template = "java -Xmx{memory} -jar '{parsemis_path}' --graphFile='{in_file}' --outputFile='{out_file}' --minimumFrequency={threshold} --maximumNodeCount={max_size} --minimumNodeCount={min_size} --algorithm=dagma --storeEmbeddings=true --distribution=threads --threads={nb_threads} --swapFile='{swap_file}'"
29+
#--closeGraph=true --zaretsky=true --subdue=true --singleRooted=true
30+
3031
print(f'Running Frequent Subgraph Mining for input folder: {in_folder}')
3132
# Currently only support for line graph
3233
for idx, in_file in enumerate([file_name for file_name in os.listdir(in_folder) if file_name.endswith('.lg')]):

mining/subgraph_statistics.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ def __init__(self, graph_db: List[nx.DiGraph], subgraphs: List[nx.DiGraph], labe
213213
self.occurrences_transaction = {}
214214
self.occurrences_embeddings = {}
215215
self.occurrences_references = {}
216+
self.compressions = {}
216217
self.lattice = lattice
217218

218219
def _set_label_name(self):
@@ -259,7 +260,8 @@ def compute_occurrences_lattice_based(self):
259260
self.occurrences_transaction[lattice_node.graph.name] = len(lattice_node.occurrences)
260261
# the occurrences of the nodes is just the id in the graph database, we want to get the names of the graphs instead
261262
self.occurrences_references[lattice_node.graph.name] = [self.graph_db[graph_id].name for graph_id in lattice_node.occurrences]
262-
263+
# absolute compression (heuristic)
264+
self.compressions[lattice_node.graph.name] = (len(lattice_node.occurrences)-1) * (len(lattice_node.graph.nodes()) + len(lattice_node.graph.edges()))
263265

264266
def write_as_csv(self, save_path, additional_tag):
265267
'''
@@ -277,8 +279,8 @@ def write_as_csv(self, save_path, additional_tag):
277279
occurrences_embeddings = self.occurrences_embeddings[subgraph.name] if subgraph.name in self.occurrences_embeddings.keys() else 0
278280
occurrences_transaction = self.occurrences_transaction[subgraph.name] if subgraph.name in self.occurrences_transaction.keys() else 0
279281
occurrences_references = self.occurrences_references[subgraph.name] if subgraph.name in self.occurrences_references.keys() else []
280-
csvwriter.writerow([subgraph.name, additional_tag, occurrences_embeddings, occurrences_transaction, occurrences_references])
281-
282+
compression = self.compressions[subgraph.name] if subgraph.name in self.compressions.keys() else 0
283+
csvwriter.writerow([subgraph.name, additional_tag, occurrences_embeddings, occurrences_transaction, compression, occurrences_references])
282284

283285
def write_as_md(self, save_path, project_name):
284286
os.makedirs(os.path.dirname(save_path), exist_ok=True)
@@ -291,7 +293,7 @@ def write_as_md(self, save_path, project_name):
291293
occurrences_references = self.occurrences_references[subgraph.name] if subgraph.name in self.occurrences_references.keys() else []
292294
mdfile.write(f"Frequency: {occurrences_transaction}\n\n")
293295
mdfile.write("Put your notes here\n\n")
294-
mdfile.write("<details><summary>Matches</summar><p>\n")
296+
mdfile.write("<details><summary>Matches</summary><p>\n")
295297
for occurrence in occurrences_references:
296298
tokens = occurrence.split('$$$')
297299
if not len(tokens) >= 2:

0 commit comments

Comments
 (0)