Skip to content

Commit 432de6a

Browse files
authored
Merge pull request #36 from VariantSync/tier-bm
Update Main
2 parents 5119f4c + ad36532 commit 432de6a

46 files changed

Lines changed: 785 additions & 101 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

docs/datasets/emacs.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Project name | Domain | Source code available (**y**es/**n**o)? | Is it a git repository (**y**es/**n**o)? | Repository URL | Clone URL | Estimated number of commits
2+
---|-------------------------|-----------------------------------------|-----------------------------------|--------------------------------------------------------------|------------------------------------------------------------------|---
3+
emacs | text editor | y | y | https://github.com/emacs-mirror/emacs | https://github.com/emacs-mirror/emacs.git | 153,926

genUltimateResults.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
resultsdir=$1
2+
3+
java -cp "target/diffdetective-1.0.0-jar-with-dependencies.jar" org.variantsync.diffdetective.tablegen.MiningResultAccumulator $resultsdir $resultsdir
4+
echo "genUltimateResults.sh DONE"

mining/compute_components.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,10 @@ def has_node(graph, label):
122122
def main(input_folder, output_folder, dataset_name, max_components_per_file=200, formatting=INPUT_FORMAT_NX):
123123
# TODO doing this with streams and yield would be a nicer solution to the chunking.
124124
components, nb_of_components_per_diff, filtered = get_components(input_folder, formatting=formatting, filtered=True)
125-
125+
for component in components:
126+
if not nx.is_directed_acyclic_graph(component):
127+
print(f"WARN: THERE ARE NON DAG GRAPHS IN THE INPUT: {component.name}")
128+
126129
components_batched = [components[i*max_components_per_file:min((i+1)*max_components_per_file, len(components))] for i in range(ceil(len(components)/max_components_per_file))]
127130

128131
# Create output folder if it doesn't exist yet

mining/parse_utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ def import_tlv(path, parse_support=True):
251251

252252
# Some file formats give the support directly, others list all the embeddings. We support both options.
253253
regex_support = r"Support: (\d+).*"
254-
regex_embedding = r"#=> (\d+) .*"
254+
regex_embedding = r"#=> ([^\s]+) .*"
255255

256256
# if tlv header continue parsing
257257
match_header = re.match(regex_header, next_line)
@@ -281,11 +281,14 @@ def import_tlv(path, parse_support=True):
281281
elif match_support:
282282
support = int(match_support.group(1))
283283
elif match_embedding:
284-
support_set.add(int(match_embedding.group(1)))
284+
support_set.add(str(match_embedding.group(1)))
285285
next_line = graph_db.readline()
286286
if next_line:
287287
match_header = re.match(regex_header, next_line)
288288

289+
if support_set is not None:
290+
graph.graph['embeddings'] = str(support_set)
291+
289292
if (support is None and support_set == set() and parse_support):
290293
print("WARN: Error parsing line graph with graph support. Check format.")
291294
elif not parse_support:

mining/run_parsemis.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ def run_parsemis(lib_path, in_folder, output_folder, threshold, min_size, max_si
2525
memory = str(get_available_memory()) + 'G'
2626
nb_threads = os.cpu_count()
2727
# Template for shell command
28-
parsemis_cmd_template = "java -Xmx{memory} -jar '{parsemis_path}' --graphFile='{in_file}' --outputFile='{out_file}' --minimumFrequency={threshold} --maximumNodeCount={max_size} --minimumNodeCount={min_size} --algorithm=dagma --singleRooted=true --closeGraph=true --zaretsky=true --subdue=true --storeEmbeddings=true --distribution=threads --threads={nb_threads} --swapFile='{swap_file}'"
29-
28+
parsemis_cmd_template = "java -Xmx{memory} -jar '{parsemis_path}' --graphFile='{in_file}' --outputFile='{out_file}' --minimumFrequency={threshold} --maximumNodeCount={max_size} --minimumNodeCount={min_size} --algorithm=dagma --storeEmbeddings=true --distribution=threads --threads={nb_threads} --swapFile='{swap_file}'"
29+
#--closeGraph=true --zaretsky=true --subdue=true --singleRooted=true
30+
3031
print(f'Running Frequent Subgraph Mining for input folder: {in_folder}')
3132
# Currently only support for line graph
3233
for idx, in_file in enumerate([file_name for file_name in os.listdir(in_folder) if file_name.endswith('.lg')]):

mining/subgraph_statistics.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ def __init__(self, graph_db: List[nx.DiGraph], subgraphs: List[nx.DiGraph], labe
213213
self.occurrences_transaction = {}
214214
self.occurrences_embeddings = {}
215215
self.occurrences_references = {}
216+
self.compressions = {}
216217
self.lattice = lattice
217218

218219
def _set_label_name(self):
@@ -259,7 +260,8 @@ def compute_occurrences_lattice_based(self):
259260
self.occurrences_transaction[lattice_node.graph.name] = len(lattice_node.occurrences)
260261
# the occurrences of the nodes is just the id in the graph database, we want to get the names of the graphs instead
261262
self.occurrences_references[lattice_node.graph.name] = [self.graph_db[graph_id].name for graph_id in lattice_node.occurrences]
262-
263+
# absolute compression (heuristic)
264+
self.compressions[lattice_node.graph.name] = (len(lattice_node.occurrences)-1) * (len(lattice_node.graph.nodes()) + len(lattice_node.graph.edges()))
263265

264266
def write_as_csv(self, save_path, additional_tag):
265267
'''
@@ -277,8 +279,8 @@ def write_as_csv(self, save_path, additional_tag):
277279
occurrences_embeddings = self.occurrences_embeddings[subgraph.name] if subgraph.name in self.occurrences_embeddings.keys() else 0
278280
occurrences_transaction = self.occurrences_transaction[subgraph.name] if subgraph.name in self.occurrences_transaction.keys() else 0
279281
occurrences_references = self.occurrences_references[subgraph.name] if subgraph.name in self.occurrences_references.keys() else []
280-
csvwriter.writerow([subgraph.name, additional_tag, occurrences_embeddings, occurrences_transaction, occurrences_references])
281-
282+
compression = self.compressions[subgraph.name] if subgraph.name in self.compressions.keys() else 0
283+
csvwriter.writerow([subgraph.name, additional_tag, occurrences_embeddings, occurrences_transaction, compression, occurrences_references])
282284

283285
def write_as_md(self, save_path, project_name):
284286
os.makedirs(os.path.dirname(save_path), exist_ok=True)
@@ -291,7 +293,7 @@ def write_as_md(self, save_path, project_name):
291293
occurrences_references = self.occurrences_references[subgraph.name] if subgraph.name in self.occurrences_references.keys() else []
292294
mdfile.write(f"Frequency: {occurrences_transaction}\n\n")
293295
mdfile.write("Put your notes here\n\n")
294-
mdfile.write("<details><summary>Matches</summar><p>\n")
296+
mdfile.write("<details><summary>Matches</summary><p>\n")
295297
for occurrence in occurrences_references:
296298
tokens = occurrence.split('$$$')
297299
if not len(tokens) >= 2:

0 commit comments

Comments
 (0)