Added analysis.py to the main flow of the program

Andrei-Constantin-Programmer · Andrei-Constantin-Programmer · commit 6be10cfe3e51 · 2025-01-04T18:44:56.000Z
diff --git a/src/analysis/analysis.py b/src/analysis/analysis.py
@@ -1,12 +1,17 @@
-from src.infrastructure.file_utils import read_csv
+import os
 import matplotlib.pyplot as plt
 import numpy as np
+from src.infrastructure import file_utils, repository_utils
 
-def create_size_impact_plot():
+def _save_plot(plot: plt, name: str):  # writes the current figure to CHARTS_PATH/<name>.jpg
+    plot.savefig(os.path.join(file_utils.CHARTS_PATH, f"{name}.jpg"))
+    plot.close()  # close the figure so the next chart starts clean instead of drawing over this one
+
+def _create_size_impact_plot():
     # 3. How does the size of a commit impact the results?
 
     # Read data from the repo_data csv
-    repo_data = read_csv("repo_data")
+    repo_data = file_utils.read_csv("repo_data")
 
     # Initialize arrays to store plot points
     x = []
@@ -19,6 +24,7 @@ def create_size_impact_plot():
 
         # Get the total number of tests for the repo
         total_test_count = int(repo['Test Before']) + int(repo['Test During']) + int(repo['Test After'])
+        total_test_count = 1 if total_test_count == 0 else total_test_count
 
         # Append the percentage of TDD for the repo to Y
         y.append((int(repo['Test Before'])/total_test_count)*100)
@@ -35,18 +41,16 @@ def create_size_impact_plot():
     plt.plot(x, a * x + b, color="red", alpha=0.5)
 
     # Set title and axes labels
-    plt.xlabel("Commit Count")
+    plt.xlabel("Repo Size (No. of files)")
     plt.ylabel("Percentage of TDD")
-    plt.title("Scatter plot showing how the size of commits impacts results")
-
-    # Show the plot
-    plt.show()
+    plt.title("Repo size and TDD percentage")
+    _save_plot(plt, "Size Impact")
 
-def create_box_plot():
+def _create_box_plot():
     # 2. How often is a test class (file) created (a) before, (b) after, or (c) in the same commit as a tested class (file)?
 
     # Read data from the repo_data csv
-    repo_data = read_csv("repo_data")
+    repo_data = file_utils.read_csv("repo_data")
 
     # Initialize arrays to store percentages for each repo
     before = []
@@ -57,58 +61,55 @@ def create_box_plot():
     for repo in repo_data:
         # Get the total number of tests for the repo
         total_test_count = int(repo['Test Before']) + int(repo['Test During']) + int(repo['Test After'])
+        total_test_count = 1 if total_test_count == 0 else total_test_count
 
         # Append the percentage data to each array
         before.append((int(repo['Test Before']) / total_test_count) * 100)
         during.append((int(repo['Test During']) / total_test_count) * 100)
         after.append((int(repo['Test After']) / total_test_count) * 100)
 
     # Plot the box plots
-    boxplt = plt.boxplot([before, during, after], patch_artist=True, tick_labels=["Before", "During", "After"], flierprops= dict(markerfacecolor='coral'))
+    boxplt = plt.boxplot([before, after, during], patch_artist=True, tick_labels=["Before", "After", "During"], flierprops= dict(markerfacecolor='coral'))
 
     colors = ['palegreen', 'lightblue', 'lightskyblue']
     for patch, color in zip(boxplt['boxes'], colors):
         patch.set_facecolor(color)
 
     # Set title and axes labels
     plt.ylabel("Percentage")
-    plt.title("Boxplot showing how often a test is created\nbefore, during and after implementation")
-
-    # Show the plot
-    plt.show()
+    plt.title("How often a test is created before, after and during implementation")
+    _save_plot(plt, "TDD Usage Statistics")
 
 
-def create_avg_commit_size_plot():
+def _create_avg_commit_size_plot():
     # Read data from the repo_data csv
-    repo_data = read_csv("repo_data")
+    repo_data = file_utils.read_csv("repo_data")
 
     # Initialize variables to store averages for each repo
     before_avg  = 0
-    during_avg = 0
     after_avg = 0
+    during_avg = 0
 
     # Iterate through each repo and update the before, during and after averages
     for repo in repo_data:
         before_avg = (before_avg + float(repo['Avg Before Commit Size'])) / 2
-        during_avg = (during_avg + float(repo['Avg During Commit Size'])) / 2
         after_avg = (after_avg + float(repo['Avg After Commit Size'])) / 2
+        during_avg = (during_avg + float(repo['Avg During Commit Size'])) / 2
 
     # Plot the bar chart
     colors = ['palegreen', 'lightblue', 'lightskyblue']
-    plt.bar(["Before", "During", "After"], [before_avg, during_avg, after_avg], align='center', color=colors)
+    plt.bar(["Before", "After", "During"], [before_avg, after_avg, during_avg], align='center', color=colors)
 
     # Set title and axes labels
     plt.xlabel("Commit relation between tests and implementation")
     plt.ylabel("Average Commit Size (No. of files)")
-    plt.title("Bar chart showing the average commit size when a\ntests are created before, during and after implementation")
-
-    # Show the plot
-    plt.show()
+    plt.title("Average commit size when tests are created \nbefore, after and during implementation")
+    _save_plot(plt, "Average Commit Size")
 
 
-def create_pie_plot():
+def _create_pie_plot():
     # Read data from the author_data csv
-    author_data = read_csv("author_data")
+    author_data = file_utils.read_csv("author_data")
 
     # Initialize Counters
     #10 25 50 70 90 100
@@ -117,7 +118,7 @@ def create_pie_plot():
 
     for author in author_data:
         # Calculate the percentage of TDD of the author
-        TDD_percent = (float(author['Test Before']) / (float(author['Test Before']) + float(author['Test During']) + float(author['Test After']))) * 100
+        TDD_percent = (float(author['Test Before']) / max(1, float(author['Test Before']) + float(author['Test During']) + float(author['Test After']))) * 100
 
         # Update the counters array based on this result
         if TDD_percent < 10:
@@ -134,7 +135,7 @@ def create_pie_plot():
             counters[5] += 1
 
     # Convert the counters into percentages using a lambda function and map
-    percentages = list(map(lambda x: x/len(author_data)*100, counters))
+    percentages = list(map(lambda x: x/max(1, len(author_data))*100, counters))
 
     labels = ['Non TDD', 'Rarely TDD', 'Occasionally TDD', 'Somewhat TDD', 'Mostly TDD', 'Consistently TDD']
     for i in range(len(labels)):
@@ -148,13 +149,11 @@ def create_pie_plot():
     plt.legend(patches, labels, loc="upper left")
     plt.axis('equal')
     plt.title("Pie chart showing the percentage of authors using levels of TDD")
-
-    # Show the plot
-    plt.show()
-
-create_size_impact_plot()
-create_box_plot()
-create_avg_commit_size_plot()
-create_pie_plot()
+    _save_plot(plt, "TDD Categories")
 
 
+def create_plots():  # public entry point: builds and saves every chart, in this order
+    _create_size_impact_plot()  # repo size vs. percentage of TDD ("Size Impact")
+    _create_box_plot()  # box plots of before/after/during percentages ("TDD Usage Statistics")
+    _create_avg_commit_size_plot()  # bar chart of average commit sizes ("Average Commit Size")
+    _create_pie_plot()  # pie chart of author TDD levels ("TDD Categories")
diff --git a/src/infrastructure/configuration.py b/src/infrastructure/configuration.py
@@ -11,6 +11,7 @@
 
 def setup_directories():
     file_utils.create_directory(file_utils.RESULTS_PATH, delete_existing=True)
+    file_utils.create_directory(file_utils.CHARTS_PATH)
     file_utils.create_directory(file_utils.LOGS_PATH)
     file_utils.create_directory(file_utils.COMMITS_PATH)
 
diff --git a/src/infrastructure/file_utils.py b/src/infrastructure/file_utils.py
@@ -6,6 +6,7 @@
 ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
 RESOURCES_PATH = os.path.join(ROOT_PATH, "resources", "repositories")
 RESULTS_PATH = os.path.join(ROOT_PATH, "results")
+CHARTS_PATH = os.path.join(RESULTS_PATH, "charts")
 LOGS_PATH = os.path.join(ROOT_PATH, "logs")
 COMMITS_PATH = os.path.join(ROOT_PATH, "commits")
 
diff --git a/src/presentation/analysis_manager.py b/src/presentation/analysis_manager.py
@@ -9,6 +9,7 @@
 from src.mining import commit_retrieval as retrieval
 from src.mining.csv_export import update_author_count, update_author_data, update_repo_data, anonymyse_authors
 from src.models.Repository import Repository
+from src.analysis import analysis
 
 class AnalysisManager():
     def __init__(self, date_of_experiment: datetime):
@@ -114,10 +115,12 @@ async def process_and_update(repo):
     async def perform_analysis_on_repo(self, repo: Repository, file_handler: LanguageFileHandler, force_mine: bool):
         await self._process_repositories([repo], file_handler, batch_size=1, force_mine=force_mine)
         anonymyse_authors()
+        analysis.create_plots()
 
     async def perform_analysis(self, file_handlers: list, batch_size: int, force_mine: bool):
         for file_handler in file_handlers:
             repositories = repository_utils.read_repositories(file_handler.name.lower())
             await self._process_repositories(repositories, file_handler, batch_size, force_mine)
 
-        anonymyse_authors()
+        anonymyse_authors()
+        analysis.create_plots()