Skip to content

Commit 662baa8

Browse files
authored
Merge pull request #10 from OPPIDA/feat/sast-filepath
2 parents 6467403 + 6bf8958 commit 662baa8

16 files changed

Lines changed: 106 additions & 62 deletions

File tree

codesectools/datasets/BenchmarkJava/dataset.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class TestCode(File):
2929

3030
def __init__(
3131
self,
32-
filename: str,
32+
filepath: Path,
3333
content: str | bytes,
3434
cwes: list[CWE],
3535
vuln_type: str,
@@ -38,15 +38,15 @@ def __init__(
3838
"""Initialize a TestCode instance.
3939
4040
Args:
41-
filename: The name of the file.
41+
filepath: The path to the file.
4242
content: The content of the file, as a string or bytes.
4343
cwes: A list of CWEs associated with the file.
4444
vuln_type: The type of vulnerability.
4545
has_vuln: A boolean indicating if the vulnerability is real or a false positive test case.
4646
4747
"""
4848
super().__init__(
49-
filename=filename, content=content, cwes=cwes, has_vuln=has_vuln
49+
filepath=filepath, content=content, cwes=cwes, has_vuln=has_vuln
5050
)
5151

5252
self.vuln_type = vuln_type
@@ -148,10 +148,19 @@ def load_dataset(self) -> list[TestCode]:
148148
next(reader)
149149
for row in reader:
150150
filename = f"{row[0]}.java"
151-
content = (testcode_dir / filename).read_text()
151+
filepath = testcode_dir / filename
152+
content = filepath.read_text()
152153
cwes = [CWEs.from_id(int(row[3]))]
153154
vuln_type = row[1]
154155
has_vuln = True if row[2] == "true" else False
155-
files.append(TestCode(filename, content, cwes, vuln_type, has_vuln))
156+
files.append(
157+
TestCode(
158+
filepath.relative_to(self.directory),
159+
content,
160+
cwes,
161+
vuln_type,
162+
has_vuln,
163+
)
164+
)
156165

157166
return files

codesectools/datasets/core/dataset.py

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from __future__ import annotations
1010

1111
from abc import ABC, abstractmethod
12+
from pathlib import Path
1213
from typing import TYPE_CHECKING
1314

1415
import git
@@ -21,7 +22,6 @@
2122
from codesectools.utils import USER_CACHE_DIR
2223

2324
if TYPE_CHECKING:
24-
from pathlib import Path
2525
from typing import Self
2626

2727
from codesectools.sasts.core.parser import AnalysisResult, Defect
@@ -197,7 +197,7 @@ class File(DatasetUnit):
197197
"""Represent a single file in a dataset.
198198
199199
Attributes:
200-
filename (str): The name of the file.
200+
filepath (Path): The relative path to the file.
201201
content (bytes): The byte content of the file.
202202
cwes (list[CWE]): A list of CWEs associated with the file.
203203
has_vuln (bool): True if the vulnerability is real, False if it's
@@ -206,20 +206,21 @@ class File(DatasetUnit):
206206
"""
207207

208208
def __init__(
209-
self, filename: str, content: str | bytes, cwes: list[CWE], has_vuln: bool
209+
self, filepath: Path, content: str | bytes, cwes: list[CWE], has_vuln: bool
210210
) -> None:
211211
"""Initialize a File instance.
212212
213213
Args:
214-
filename: The name of the file.
214+
filepath: The relative path of the file.
215215
content: The content of the file, as a string or bytes. It will be
216216
converted to bytes if provided as a string.
217217
cwes: A list of CWEs associated with the file.
218218
has_vuln: True if the vulnerability is real, False if it's
219219
intended to be a false positive test case.
220220
221221
"""
222-
self.filename = filename
222+
self.filepath = filepath
223+
self.filename = self.filepath.name
223224
self.content = content
224225
self.cwes = cwes
225226
self.has_vuln = has_vuln
@@ -231,29 +232,29 @@ def __repr__(self) -> str:
231232
"""Return a developer-friendly string representation of the File.
232233
233234
Returns:
234-
A string showing the class name, filename, and CWE IDs.
235+
A string showing the class name, filepath, and CWE IDs.
235236
236237
"""
237238
return f"""{self.__class__.__name__}(
238-
filename: \t{self.filename}
239+
filepath: \t{self.filepath}
239240
cwes: \t{self.cwes}
240241
)"""
241242

242-
def __eq__(self, other: str | Self) -> bool:
243-
"""Compare this File with another object for equality based on filename.
243+
def __eq__(self, other: str | Path | Self) -> bool:
244+
"""Compare this File with another object for equality based on filepath.
244245
245246
Args:
246-
other: The object to compare with. Can be a string (filename) or
247+
other: The object to compare with. Can be a string/Path (filepath) or
247248
another File instance.
248249
249250
Returns:
250-
True if the filenames are equal, False otherwise.
251+
True if the filepaths are equal, False otherwise.
251252
252253
"""
253-
if isinstance(other, str):
254-
return self.filename == other
254+
if isinstance(other, (str, Path)):
255+
return self.filepath == Path(other)
255256
elif isinstance(other, self.__class__):
256-
return self.filename == other.filename
257+
return self.filepath == other.filepath
257258
else:
258259
return False
259260

@@ -264,7 +265,9 @@ def save(self, dir: Path) -> None:
264265
dir: The path to the directory where the file should be saved.
265266
266267
"""
267-
(dir / self.filename).write_bytes(self.content)
268+
target_path = dir / self.filepath
269+
target_path.parent.mkdir(parents=True, exist_ok=True)
270+
target_path.write_bytes(self.content)
268271

269272

270273
class FileDataset(Dataset):
@@ -303,7 +306,7 @@ def validate(self, analysis_result: AnalysisResult) -> FileDatasetData:
303306
"""
304307
# 1. Prepare ground truth from all files in the dataset
305308
ground_truth: dict[str, tuple[bool, set[CWE]]] = {
306-
file.filename: (file.has_vuln, set(file.cwes)) for file in self.files
309+
str(file.filepath): (file.has_vuln, set(file.cwes)) for file in self.files
307310
}
308311

309312
# 2. Process reported defects to get unique (file, cwe) pairs
@@ -313,32 +316,32 @@ def validate(self, analysis_result: AnalysisResult) -> FileDatasetData:
313316
if not defect.cwe or defect.cwe.id == -1:
314317
continue
315318

316-
file_cwe_pair = (defect.filename, defect.cwe)
319+
file_cwe_pair = (str(defect.filepath), defect.cwe)
317320
if file_cwe_pair not in unique_reported_defects:
318321
unique_reported_defects[file_cwe_pair] = defect
319322

320323
# 3. Classify unique reported vulnerabilities as TP or FP
321324
tp_defects_map: dict[tuple[str, CWE], Defect] = {}
322325
fp_defects_map: dict[tuple[str, CWE], Defect] = {}
323326

324-
for (filename, cwe), defect in unique_reported_defects.items():
325-
has_vuln, expected_cwes = ground_truth.get(filename, (False, set()))
327+
for (filepath, cwe), defect in unique_reported_defects.items():
328+
has_vuln, expected_cwes = ground_truth.get(filepath, (False, set()))
326329

327330
if has_vuln and cwe in expected_cwes:
328331
# Correctly identified a vulnerability
329-
tp_defects_map[(filename, cwe)] = defect
332+
tp_defects_map[(filepath, cwe)] = defect
330333
else:
331334
# Reported a vuln in a non-vulnerable file, with wrong CWE,
332335
# or in a file not part of the dataset.
333-
fp_defects_map[(filename, cwe)] = defect
336+
fp_defects_map[(filepath, cwe)] = defect
334337

335338
# 4. Determine False Negatives by finding what was missed from the ground truth.
336339
fn_defects_set: set[tuple[str, CWE]] = set()
337-
for filename, (has_vuln, expected_cwes) in ground_truth.items():
340+
for filepath, (has_vuln, expected_cwes) in ground_truth.items():
338341
if has_vuln:
339342
for expected_cwe in expected_cwes:
340-
if (filename, expected_cwe) not in tp_defects_map:
341-
fn_defects_set.add((filename, expected_cwe))
343+
if (filepath, expected_cwe) not in tp_defects_map:
344+
fn_defects_set.add((filepath, expected_cwe))
342345

343346
# 5. Convert maps and sets to lists of objects for downstream use
344347
tp_defects = list(tp_defects_map.values())
@@ -354,7 +357,7 @@ def validate(self, analysis_result: AnalysisResult) -> FileDatasetData:
354357
fp_cwes = [cwe for _, cwe in fp_defects_map.keys()]
355358
fn_cwes = [cwe for _, cwe in fn_defects_set]
356359

357-
unique_correct_number = len({filename for filename, _ in tp_defects_map.keys()})
360+
unique_correct_number = len({filepath for filepath, _ in tp_defects_map.keys()})
358361

359362
return FileDatasetData(
360363
dataset=self,

codesectools/sasts/all/cli.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,13 @@ def benchmark(
119119
dataset: Annotated[
120120
str,
121121
typer.Argument(
122-
click_type=Choice([d.name for d in all_sast.sasts_by_dataset]),
122+
click_type=Choice(
123+
[
124+
f"{d.name}_{lang}"
125+
for d in all_sast.sasts_by_dataset
126+
for lang in d.supported_languages
127+
]
128+
),
123129
metavar="DATASET",
124130
),
125131
],
@@ -140,7 +146,7 @@ def benchmark(
140146
) -> None:
141147
"""Run a benchmark on a dataset using all available SAST tools."""
142148
dataset_name, lang = dataset.split("_")
143-
for sast in all_sast.sasts_by_dataset.get(lang, []):
149+
for sast in all_sast.sasts_by_dataset.get(DATASETS_ALL[dataset_name], []):
144150
dataset = DATASETS_ALL[dataset_name](lang)
145151
if isinstance(dataset, FileDataset):
146152
sast.analyze_files(dataset, overwrite, testing)

codesectools/sasts/all/graphics.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import shutil
44
import tempfile
5-
from pathlib import Path
65

76
import matplotlib
87
import matplotlib.pyplot as plt
@@ -11,6 +10,7 @@
1110
from rich import print
1211

1312
from codesectools.sasts.all.sast import AllSAST
13+
from codesectools.utils import shorten_path
1414

1515
## Matplotlib config
1616
matplotlib.rcParams.update(
@@ -107,7 +107,7 @@ def __init__(self, project_name: str) -> None:
107107
def plot_overview(self) -> Figure:
108108
"""Generate an overview plot with stats by files, SAST tools, and categories."""
109109
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, layout="constrained")
110-
by_files = {Path(k).name: v for k, v in self.result.stats_by_files().items()}
110+
by_files = self.result.stats_by_files()
111111
by_sasts = self.result.stats_by_sasts()
112112
by_categories = self.result.stats_by_categories()
113113

@@ -117,7 +117,7 @@ def plot_overview(self) -> Figure:
117117
list(by_files.items()), key=lambda e: e[1]["count"], reverse=True
118118
)
119119
for k, v in sorted_files[: self.limit]:
120-
X_files.append(k)
120+
X_files.append(shorten_path(k))
121121
Y_files.append(v["count"])
122122

123123
COLORS_COUNT = {v: 0 for k, v in self.color_mapping.items()}
@@ -130,11 +130,11 @@ def plot_overview(self) -> Figure:
130130
current_height = 0
131131
for color, height in COLORS_COUNT.items():
132132
if height > 0:
133-
bars.append((k, current_height + height, color))
133+
bars.append((shorten_path(k), current_height + height, color))
134134
current_height += height
135135

136-
for k, height, color in bars[::-1]:
137-
ax1.bar(k, height, color=color)
136+
for k_short, height, color in bars[::-1]:
137+
ax1.bar(k_short, height, color=color)
138138

139139
ax1.set_xticks(X_files, X_files, rotation=45, ha="right")
140140
ax1.set_title(f"Stats by files (limit to {self.limit})")
@@ -231,7 +231,7 @@ def plot_top_cwes(self) -> Figure:
231231
def plot_top_scores(self) -> Figure:
232232
"""Generate a stacked bar plot for files with the highest scores."""
233233
fig, ax = plt.subplots(1, 1, layout="constrained")
234-
by_scores = {Path(k).name: v for k, v in self.result.stats_by_scores().items()}
234+
by_scores = self.result.stats_by_scores()
235235

236236
for file, data in by_scores.items():
237237
by_scores[file]["total_score"] = sum(data["score"].values())
@@ -244,7 +244,7 @@ def plot_top_scores(self) -> Figure:
244244

245245
X_files, score_data = [], []
246246
for file, data in sorted_files[: self.limit]:
247-
X_files.append(file)
247+
X_files.append(shorten_path(file))
248248
score_data.append(data["score"])
249249

250250
score_keys = score_data[0].keys()

codesectools/sasts/core/graphics.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
import shutil
99
import tempfile
10-
from pathlib import Path
1110

1211
import matplotlib
1312
import matplotlib.pyplot as plt
@@ -19,6 +18,7 @@
1918
from codesectools.datasets.core.dataset import FileDataset, GitRepoDataset
2019
from codesectools.sasts.core.sast import SAST
2120
from codesectools.shared.cwe import CWE
21+
from codesectools.utils import shorten_path
2222

2323
## Matplotlib config
2424
matplotlib.rcParams.update(
@@ -153,7 +153,7 @@ def plot_overview(self) -> Figure:
153153
project_name = self.result.name
154154

155155
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, layout="constrained")
156-
by_files = {Path(k).name: v for k, v in self.result.stats_by_files().items()}
156+
by_files = self.result.stats_by_files()
157157
by_checkers = self.result.stats_by_checkers()
158158
by_categories = self.result.stats_by_categories()
159159

@@ -163,7 +163,7 @@ def plot_overview(self) -> Figure:
163163
list(by_files.items()), key=lambda e: e[1]["count"], reverse=True
164164
)
165165
for k, v in sorted_files[: self.limit]:
166-
X_files.append(k)
166+
X_files.append(shorten_path(k))
167167
Y_files.append(v["count"])
168168

169169
COLORS_COUNT = {v: 0 for k, v in self.color_mapping.items()}
@@ -177,11 +177,11 @@ def plot_overview(self) -> Figure:
177177
current_height = 0
178178
for color, height in COLORS_COUNT.items():
179179
if height > 0:
180-
bars.append((k, current_height + height, color))
180+
bars.append((shorten_path(k), current_height + height, color))
181181
current_height += height
182182

183-
for k, height, color in bars[::-1]:
184-
ax1.bar(k, height, color=color)
183+
for k_short, height, color in bars[::-1]:
184+
ax1.bar(k_short, height, color=color)
185185

186186
ax1.set_xticks(X_files, X_files, rotation=45, ha="right")
187187
ax1.set_title(f"Stats by files (limit to {self.limit})")

codesectools/sasts/core/parser.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def __repr__(self) -> str:
7272
7373
"""
7474
return f"""{self.__class__.__name__}(
75-
file: \t{self.file}
75+
filepath: \t{self.filepath}
7676
checker: \t{self.checker}
7777
category: \t{self.category}
7878
cwe: \t{self.cwe}
@@ -127,6 +127,17 @@ def __init__(
127127
self.loc = loc
128128
self.data = data
129129

130+
# Ensure all defect filepaths are relative to the source path
131+
for defect in self.defects:
132+
if defect.filepath.is_absolute():
133+
try:
134+
defect.filepath = defect.filepath.relative_to(self.source_path)
135+
defect.filepath_str = str(defect.filepath)
136+
except ValueError:
137+
# This can happen if the path is outside the source_path tree.
138+
# We leave it as is, but it will likely not match during validation.
139+
pass
140+
130141
def __repr__(self) -> str:
131142
"""Return a developer-friendly string representation of the AnalysisResult.
132143

0 commit comments

Comments
 (0)