Skip to content

Commit 9167760

Browse files
authored
Merge pull request #14 from OPPIDA/feat/sasts-analyze
2 parents 9457a4e + 78e0a0c commit 9167760

5 files changed

Lines changed: 92 additions & 82 deletions

File tree

codesectools/datasets/BenchmarkJava/dataset.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ class BenchmarkJava(PrebuiltFileDataset):
5454
Attributes:
5555
name (str): The name of the dataset, "BenchmarkJava".
5656
supported_languages (list[str]): A list of supported programming languages.
57+
license (str): The license under which the dataset is distributed.
58+
license_url (str): A URL to the full text of the license.
59+
build_command (str): The command to build the Java project.
60+
prebuilt_expected (tuple): A tuple defining the path and glob pattern for expected build artifacts.
61+
artefacts_arg (str): The argument to specify the location of build artifacts for SAST tools.
5762
5863
"""
5964

@@ -64,6 +69,7 @@ class BenchmarkJava(PrebuiltFileDataset):
6469

6570
build_command = "mvn clean compile"
6671
prebuilt_expected = (Path("target/classes/org/owasp/benchmark/testcode"), "*.class")
72+
artefacts_arg = "."
6773

6874
def __init__(self, lang: None | str = None) -> None:
6975
"""Initialize the BenchmarkJava dataset.

codesectools/datasets/core/dataset.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -150,10 +150,19 @@ def list_dataset_full_names(cls) -> list[str]:
150150

151151

152152
class PrebuiltDatasetMixin:
153-
"""Provide functionality for datasets that require a build step."""
153+
"""Provide functionality for datasets that require a build step.
154+
155+
Attributes:
156+
build_command (str): The command required to build the dataset.
157+
prebuilt_expected (tuple[Path, str]): A tuple containing the path and glob pattern
158+
to find the built artifacts.
159+
artefacts_arg (str): The argument to pass to the SAST tool command template.
160+
161+
"""
154162

155163
build_command: str
156164
prebuilt_expected: tuple[Path, str]
165+
artefacts_arg: str
157166

158167
def is_built(self) -> bool:
159168
"""Check if the dataset has been built."""
@@ -271,15 +280,7 @@ def save(self, dir: Path) -> None:
271280

272281

273282
class FileDataset(Dataset):
274-
"""Abstract base class for datasets composed of individual files.
275-
276-
Attributes:
277-
directory (Path): The directory path for the dataset.
278-
lang (str): The programming language of the dataset.
279-
full_name (str): The full name of the dataset, including the language.
280-
files (list[File]): A list of `File` objects loaded from the dataset.
281-
282-
"""
283+
"""Abstract base class for datasets composed of individual files."""
283284

284285
def __init__(self, lang: str) -> None:
285286
"""Initialize a FileDataset instance.

codesectools/sasts/core/sast/__init__.py

Lines changed: 26 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
from pathlib import Path
1616
from typing import Any, Literal, Union
1717

18-
import git
1918
from rich import print
2019
from rich.panel import Panel
2120
from rich.progress import Progress
@@ -123,9 +122,15 @@ def run_analysis(
123122
render_variables[to_replace] = v
124123
elif isinstance(v, Path):
125124
render_variables[to_replace] = str(v.resolve())
125+
elif isinstance(v, list):
126+
render_variables[to_replace] = v
126127
else:
127128
raise NotImplementedError(k, v)
128129

130+
# Make temporary directory available to command
131+
temp_dir = tempfile.TemporaryDirectory()
132+
render_variables["{tempdir}"] = temp_dir.name
133+
129134
with Progress() as progress:
130135
progress.add_task(
131136
f"[b][{self.name}][/b] analyzing: [i]{project_dir.name}[/i]",
@@ -165,7 +170,7 @@ def save_results(self, project_dir: Path, output_dir: Path, extra: dict) -> None
165170
166171
"""
167172
output_dir.mkdir(exist_ok=True, parents=True)
168-
json.dump(extra, (output_dir / "cstools_output.json").open("w"))
173+
json.dump(extra, (output_dir / "cstools_output.json").open("w"), indent=4)
169174

170175
missing_files = []
171176
for path_from_root, required in self.output_files:
@@ -175,7 +180,7 @@ def save_results(self, project_dir: Path, output_dir: Path, extra: dict) -> None
175180
filepath = project_dir / parent_dir / filename
176181
if filepath.is_file():
177182
if not filepath == output_dir / filename:
178-
shutil.copy2(filepath, output_dir / filename)
183+
filepath.rename(output_dir / filename)
179184
else:
180185
if required:
181186
missing_files.append(filename)
@@ -184,7 +189,7 @@ def save_results(self, project_dir: Path, output_dir: Path, extra: dict) -> None
184189
if filepaths:
185190
for filepath in filepaths:
186191
if not filepath == output_dir / filename:
187-
shutil.copy2(filepath, output_dir / filepath.name)
192+
filepath.rename(output_dir / filepath.name)
188193
else:
189194
if required:
190195
missing_files.append(filename)
@@ -218,25 +223,7 @@ def analyze_files(
218223
)
219224
return
220225

221-
# Create temporary directory for the project
222-
temp_dir = tempfile.TemporaryDirectory()
223-
temp_path = Path(temp_dir.name)
224-
225-
# Copy files into the temporary directory
226-
if testing:
227-
random.seed(os.environ.get("CONSTANT_RANDOM", os.urandom(16)))
228-
files = random.sample(dataset.files, k=2)
229-
else:
230-
files = dataset.files
231-
232-
for file in files:
233-
file.save(temp_path)
234-
235-
# Run analysis
236-
self.run_analysis(dataset.lang, temp_path, result_path)
237-
238-
# Clear temporary directory
239-
temp_dir.cleanup()
226+
self.run_analysis(dataset.lang, dataset.directory, result_path)
240227

241228
def analyze_repos(
242229
self, dataset: GitRepoDataset, overwrite: bool = False, testing: bool = False
@@ -252,8 +239,8 @@ def analyze_repos(
252239
testing: If True, run analysis on a sample of two small random repositories for testing purposes.
253240
254241
"""
255-
base_result_path = self.output_dir / dataset.full_name
256-
base_result_path.mkdir(exist_ok=True, parents=True)
242+
result_path = self.output_dir / dataset.full_name
243+
result_path.mkdir(exist_ok=True, parents=True)
257244

258245
if testing:
259246
random.seed(os.environ.get("CONSTANT_RANDOM", os.urandom(16)))
@@ -263,27 +250,22 @@ def analyze_repos(
263250
repos = dataset.repos
264251

265252
for repo in repos:
266-
result_path = base_result_path / repo.name
267-
if result_path.is_dir():
268-
if list(result_path.iterdir()) and not overwrite:
253+
repo_result_path = result_path / repo.name
254+
if repo_result_path.is_dir():
255+
if list(repo_result_path.iterdir()) and not overwrite:
269256
print(f"Results already exist for {repo.name}, skipping...")
270257
print("Please use --overwrite to analyze again")
258+
continue
271259

272-
# Create temporary directory for the project
273-
temp_dir = tempfile.TemporaryDirectory()
274-
repo_path = Path(temp_dir.name)
260+
repo_source_path = dataset.directory / repo.name
275261

276-
# Clone and checkout to the vulnerable commit
277-
try:
278-
repo.save(repo_path)
279-
except git.GitCommandError:
280-
continue
262+
if repo_source_path.is_dir():
263+
shutil.rmtree(repo_source_path)
281264

282-
# Run analysis
283-
self.run_analysis(dataset.lang, repo_path, result_path)
265+
repo_source_path.mkdir()
266+
repo.save(repo_source_path)
284267

285-
# Clear temporary directory
286-
temp_dir.cleanup()
268+
self.run_analysis(dataset.lang, repo_source_path, repo_result_path)
287269

288270
@property
289271
def supported_dataset_full_names(self) -> list[str]:
@@ -399,28 +381,13 @@ def analyze_files(
399381
)
400382
return
401383

402-
# Create temporary directory for the project
403-
temp_dir = tempfile.TemporaryDirectory()
404-
temp_path = Path(temp_dir.name)
405-
406-
# Copy files into the temporary directory
407-
if testing:
408-
random.seed(os.environ.get("CONSTANT_RANDOM", os.urandom(16)))
409-
prebuilt_files = random.sample(dataset.list_prebuilt_files(), k=2)
410-
else:
411-
prebuilt_files = dataset.list_prebuilt_files()
412-
413-
for prebuilt_file in prebuilt_files:
414-
shutil.copy2(prebuilt_file, temp_path / prebuilt_file.name)
415-
416-
# Run analysis
417384
self.run_analysis(
418-
dataset.lang, dataset.directory, result_path, artifacts=temp_path
385+
dataset.lang,
386+
dataset.directory,
387+
result_path,
388+
artifacts=dataset.artefacts_arg,
419389
)
420390

421-
# Clear temporary directory
422-
temp_dir.cleanup()
423-
424391

425392
class PrebuiltBuildlessSAST(PrebuiltSAST, BuildlessSAST):
426393
"""Represent a SAST tool that can analyze both source code and pre-built artifacts."""

codesectools/sasts/tools/Cppcheck/sast.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
SASTRequirements,
1414
)
1515
from codesectools.sasts.tools.Cppcheck.parser import CppcheckAnalysisResult
16+
from codesectools.utils import CPU_COUNT
1617

1718

1819
class CppcheckSAST(PrebuiltBuildlessSAST):
@@ -50,6 +51,8 @@ class CppcheckSAST(PrebuiltBuildlessSAST):
5051
"--enable=all",
5152
"--xml",
5253
"--output-file=cppcheck_output.xml",
54+
"--cppcheck-build-dir={tempdir}",
55+
f"-j{CPU_COUNT}",
5356
]
5457
]
5558
valid_codes = [0]

codesectools/utils.py

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"""
77

88
import os
9+
import re
910
import subprocess
1011
from collections.abc import Sequence
1112
from importlib.resources import files
@@ -39,31 +40,60 @@ def DEBUG() -> bool:
3940

4041

4142
# Subprocess wrapper
42-
def render_command(command: list[str], map: dict[str, str]) -> list[str]:
43+
def get_pattern(arg: str, mapping: dict[str, str]) -> str | None:
44+
"""Find a placeholder pattern like '{placeholder}' in an argument string.
45+
46+
Args:
47+
arg: The string to search for a pattern.
48+
mapping: A dictionary of placeholders, kept for contextual consistency
49+
with `render_command`.
50+
51+
Returns:
52+
The found pattern string (e.g., '{placeholder}') or None if not found.
53+
54+
"""
55+
if m := re.search(r"\{.*\}", arg):
56+
return m.group(0)
57+
58+
59+
def render_command(command: list, mapping: dict[str, str]) -> list[str]:
4360
"""Render a command template by replacing placeholders with values.
4461
62+
Substitutes placeholders in a command list from a given map. It handles
63+
simple string arguments and optional arguments represented as tuples.
64+
If a mapped value is a list, the argument is expanded.
65+
4566
Args:
46-
command: The command template as a list of strings.
47-
map: A dictionary of placeholders to their replacement values.
67+
command: The command template, which can contain strings and tuples
68+
of the form `(default, optional_template)`.
69+
mapping: A dictionary of placeholders to their replacement values.
4870
4971
Returns:
5072
The rendered command as a list of strings.
5173
5274
"""
5375
_command = command.copy()
54-
for pattern, value in map.items():
55-
for i, arg in enumerate(_command):
56-
# Check if optional argument can be used
57-
if isinstance(arg, tuple):
58-
default_arg, optional_arg = arg
59-
if pattern in optional_arg:
60-
_command[i] = arg.replace(pattern, value)
76+
for i, arg in enumerate(_command):
77+
# Check if optional argument can be used
78+
if isinstance(arg, tuple):
79+
default_arg, optional_arg = arg
80+
81+
if pattern := get_pattern(optional_arg, mapping):
82+
_command[i] = optional_arg.replace(pattern, mapping[pattern])
83+
elif pattern := get_pattern(default_arg, mapping):
84+
_command[i] = default_arg.replace(pattern, mapping[pattern])
85+
else:
86+
if pattern := get_pattern(arg, mapping):
87+
value = mapping[pattern]
88+
if isinstance(value, list):
89+
_command[i] = " ".join(
90+
arg.replace(pattern, subvalue) for subvalue in value
91+
)
6192
else:
62-
_command[i] = default_arg
63-
else:
64-
if pattern in arg:
6593
_command[i] = arg.replace(pattern, value)
6694

95+
_command = " ".join(_command).split(" ")
96+
6797
# Remove not rendered part of the command:
6898
__command = []
6999
for part in _command:
@@ -193,3 +223,6 @@ def shorten_path(p: str) -> str:
193223
if len(path.parts) > 3:
194224
return str(Path("...") / path.parts[-2] / path.parts[-1])
195225
return p
226+
227+
228+
# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to 1 so code that formats this value (e.g. a "-j<N>" flag)
# always receives a positive integer instead of the string "None".
CPU_COUNT = os.cpu_count() or 1

0 commit comments

Comments
 (0)