Merge pull request #17 from OPPIDA/feat/dataset-juliet-test-suite-c

nolliv22 · web-flow · commit 9639556e5b65 · 2025-11-17T14:06:18.000+01:00
diff --git a/codesectools/datasets/JulietTestSuiteC/__init__.py b/codesectools/datasets/JulietTestSuiteC/__init__.py
@@ -0,0 +1 @@
+"""Initializes the JulietTestSuiteC dataset module."""
diff --git a/codesectools/datasets/JulietTestSuiteC/dataset.py b/codesectools/datasets/JulietTestSuiteC/dataset.py
@@ -0,0 +1,160 @@
+"""Defines the JulietTestSuiteC dataset for evaluating SAST tools on C code.
+
+This module provides the classes and logic to load the Juliet Test Suite for C/C++,
+which consists of C test files with known vulnerabilities. It downloads the source code
+from the NIST Software Assurance Reference Dataset (SARD) and parses an XML manifest
+to associate test files with expected results.
+"""
+
+import io
+import re
+import shutil
+import zipfile
+from pathlib import Path
+from typing import Self
+
+import requests
+from lxml import etree
+
+from codesectools.datasets.core.dataset import File, PrebuiltFileDataset
+from codesectools.shared.cwe import CWE, CWEs
+from codesectools.utils import CPU_COUNT
+
+
+class TestCode(File):
+    """A single source file belonging to the JulietTestSuiteC dataset."""
+
+    def __init__(
+        self,
+        filepath: Path,
+        content: str | bytes,
+        cwes: list[CWE],
+        has_vuln: bool,
+    ) -> None:
+        """Create a TestCode by forwarding every argument to the base ``File``.
+
+        Args:
+            filepath: The path to the file.
+            content: The file content, given either as text or as raw bytes.
+            cwes: The CWEs associated with the file.
+            has_vuln: True when the file holds a real vulnerability, False when
+                it is a false positive test case.
+
+        """
+        super().__init__(
+            filepath=filepath,
+            content=content,
+            cwes=cwes,
+            has_vuln=has_vuln,
+        )
+
+
+class JulietTestSuiteC(PrebuiltFileDataset):
+    """Represents the Juliet Test Suite for C/C++.
+
+    This class handles downloading, extracting, and loading the C/C++ test cases
+    from the Juliet Test Suite.
+    """
+
+    name = "JulietTestSuiteC"
+    supported_languages = ["c"]
+    license = "CC0 1.0 Universal"
+    license_url = "https://data.niaid.nih.gov/resources?id=zenodo_4701386#description"
+
+    # The suite is built through `bear`; the artefact expected afterwards is
+    # the `compile_commands.json` file at the dataset root.
+    build_command = f"bear -- make -C ./C individuals -j{CPU_COUNT}"
+    prebuilt_expected = (Path("."), "compile_commands.json")
+    artefacts_arg = "compile_commands.json"
+
+    def __init__(self, lang: None | str = None) -> None:
+        """Initialize the JulietTestSuiteC dataset.
+
+        Args:
+            lang: The programming language of the dataset files.
+                Must be one of the supported languages.
+
+        """
+        super().__init__(lang)
+
+    def __eq__(self, other: str | Self) -> bool:
+        """Compare this dataset with another object for equality.
+
+        Args:
+            other: The object to compare with. Can be a string (dataset name)
+                   or another JulietTestSuiteC instance.
+
+        Returns:
+            True if the names are equal, False otherwise.
+
+        """
+        if isinstance(other, str):
+            return self.name == other
+        elif isinstance(other, self.__class__):
+            return self.name == other.name
+        else:
+            return False
+
+    def __hash__(self) -> int:
+        """Return a hash consistent with `__eq__`.
+
+        Defining `__eq__` without `__hash__` makes a class unhashable, so the
+        hash is provided explicitly. It is derived from the dataset name, which
+        keeps it equal to the hash of the name string the instance compares
+        equal to.
+
+        Returns:
+            The hash of the dataset name.
+
+        """
+        return hash(self.name)
+
+    def download_files(self: Self, test: bool = False) -> None:
+        """Download and extract the dataset from the NIST SARD website.
+
+        Downloads the zip archive, extracts its contents, and prunes the test cases
+        to a smaller subset for faster processing. If in test mode, it further
+        reduces the dataset to only a single CWE.
+
+        Args:
+            test: If True, reduce the number of test files for faster testing.
+
+        Raises:
+            requests.exceptions.RequestException: If the download times out,
+                fails, or the server answers with an HTTP error status.
+            zipfile.BadZipFile: If the downloaded payload is not a valid archive.
+
+        """
+        response = requests.get(
+            "https://samate.nist.gov/SARD/downloads/test-suites/2017-10-01-juliet-test-suite-for-c-cplusplus-v1-3.zip",
+            # Without a timeout an unresponsive server blocks this call forever.
+            timeout=60,
+        )
+        # Fail on an HTTP error here rather than handing an error page to
+        # zipfile, which would only report a misleading "bad zip file".
+        response.raise_for_status()
+        with zipfile.ZipFile(io.BytesIO(response.content), "r") as zip_ref:
+            zip_ref.extractall(self.directory)
+
+        # Limit to one set for each CWE: every set directory except "s01" is
+        # renamed with a leading underscore. The glob is materialised first so
+        # that the directory tree is not modified while it is being walked.
+        testcases = self.directory / "C" / "testcases"
+        for set_dir in list(testcases.glob("CWE*/s*")):
+            if set_dir.name != "s01":
+                shutil.move(set_dir, set_dir.parent / f"_{set_dir.name}")
+
+        if test:
+            # Test mode keeps only the CWE835 test cases.
+            for cwe_dir in list(testcases.glob("CWE*")):
+                if not cwe_dir.name.startswith("CWE835"):
+                    shutil.rmtree(cwe_dir)
+
+    def load_dataset(self) -> list[TestCode]:
+        """Load the JulietTestSuiteC dataset from the source files.
+
+        Parses the `manifest.xml` file to identify vulnerabilities in the C/C++
+        source files and creates a `TestCode` object for each file containing a flaw.
+        Only the first `flaw` element of a file is considered, and the file is
+        skipped when that flaw's name does not contain a `CWE-<number>` token.
+
+        Returns:
+            A list of `TestCode` objects representing the dataset.
+
+        """
+        cwe_pattern = re.compile(r"CWE-(\d+)")
+        testcode_dir = self.directory / "C" / "testcases"
+        # Index the source files by file name, which is how the manifest entries
+        # are looked up below.
+        # NOTE(review): rglob also descends into the "_sNN" directories set aside
+        # by download_files, so files from those sets are indexed as well --
+        # confirm this is intended.
+        testcode_paths = {
+            path.name: path
+            for pattern in ("CWE*.c", "CWE*.cpp")
+            for path in testcode_dir.rglob(pattern)
+        }
+        manifest_path = self.directory / "C" / "manifest.xml"
+        manifest = etree.parse(manifest_path)
+
+        files = []
+        for file_tree in manifest.xpath("/container/testcase/file"):
+            file_obj = testcode_paths.get(file_tree.get("path"))
+            if file_obj is None:
+                # The manifest entry has no matching source file on disk.
+                continue
+            flaws = file_tree.xpath("flaw")
+            if not flaws:
+                # Files without a flaw element are not part of the dataset.
+                continue
+            # A missing "name" attribute is treated as "no CWE" instead of
+            # letting re raise a TypeError on None.
+            m = cwe_pattern.search(flaws[0].get("name") or "")
+            if m is None:
+                continue
+            files.append(
+                TestCode(
+                    filepath=file_obj.relative_to(self.directory),
+                    content=file_obj.read_bytes(),
+                    cwes=[CWEs.from_id(int(m.group(1)))],
+                    has_vuln=True,
+                )
+            )
+        return files
diff --git a/codesectools/sasts/tools/Cppcheck/sast.py b/codesectools/sasts/tools/Cppcheck/sast.py
@@ -36,7 +36,7 @@ class CppcheckSAST(PrebuiltBuildlessSAST):
 
     name = "Cppcheck"
     supported_languages = ["c"]
-    supported_dataset_names = []
+    supported_dataset_names = ["JulietTestSuiteC"]
     properties = SASTProperties(free=True, offline=True)
     requirements = SASTRequirements(
         full_reqs=[
diff --git a/codesectools/sasts/tools/SemgrepCE/sast.py b/codesectools/sasts/tools/SemgrepCE/sast.py
@@ -37,7 +37,7 @@ class SemgrepCESAST(BuildlessSAST):
 
     name = "SemgrepCE"
     supported_languages = ["java", "c"]
-    supported_dataset_names = ["BenchmarkJava", "CVEfixes"]
+    supported_dataset_names = ["BenchmarkJava", "CVEfixes", "JulietTestSuiteC"]
     properties = SASTProperties(free=True, offline=True)
     requirements = SASTRequirements(
         full_reqs=[
diff --git a/codesectools/sasts/tools/SnykCode/sast.py b/codesectools/sasts/tools/SnykCode/sast.py
@@ -33,7 +33,7 @@ class SnykCodeSAST(BuildlessSAST):
 
     name = "SnykCode"
     supported_languages = ["java", "c"]
-    supported_dataset_names = ["BenchmarkJava", "CVEfixes"]
+    supported_dataset_names = ["BenchmarkJava", "CVEfixes", "JulietTestSuiteC"]
     properties = SASTProperties(free=False, offline=False)
     requirements = SASTRequirements(
         full_reqs=[
diff --git a/docs/dataset/profiles/juliettestsuitec.yaml b/docs/dataset/profiles/juliettestsuitec.yaml
@@ -0,0 +1,20 @@
+name: Juliet Test Suite for C/C++ v1.3
+description: A collection of test cases in the C/C++ language. It contains examples organized under 118 different CWEs. Version 1.3 adds test cases for increment and decrement and fixes some dozen systematic problems in 1.2 cases.
+type: File
+url: https://data.niaid.nih.gov/resources?id=zenodo_4701386#description
+supported_version: Latest
+supported_languages:
+  - C/C++
+legal:
+  license: CC0 1.0 Universal
+  license_type: Public Domain
+  license_url: https://data.niaid.nih.gov/resources?id=zenodo_4701386#description
+requirements:
+  - An internet connection is required **only** to download the dataset.
+extra: |
+  !!! info "Dataset content"
+    
+      - Test files: `C/testcases/CWE*/**`
+      - Labeled data: `C/manifest.xml`
+
+      *Downloaded from [NIST SARD](https://samate.nist.gov/SARD/downloads/test-suites/2017-10-01-juliet-test-suite-for-c-cplusplus-v1-3.zip).*
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "CodeSecTools"
-version = "0.12.4"
+version = "0.13.0"
 description = "A framework for code security that provides abstractions for static analysis tools and datasets to support their integration, testing, and evaluation."
 readme = "README.md"
 license = "AGPL-3.0-only"
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+"""Initializes the JulietTestSuiteC dataset module."""`