99from __future__ import annotations
1010
1111from abc import ABC , abstractmethod
12+ from pathlib import Path
1213from typing import TYPE_CHECKING
1314
1415import git
2122from codesectools .utils import USER_CACHE_DIR
2223
2324if TYPE_CHECKING :
24- from pathlib import Path
2525 from typing import Self
2626
2727 from codesectools .sasts .core .parser import AnalysisResult , Defect
@@ -197,7 +197,7 @@ class File(DatasetUnit):
197197 """Represent a single file in a dataset.
198198
199199 Attributes:
200- filename (str ): The name of the file.
200+ filepath (Path ): The relative path to the file.
201201 content (bytes): The byte content of the file.
202202 cwes (list[CWE]): A list of CWEs associated with the file.
203203 has_vuln (bool): True if the vulnerability is real, False if it's
@@ -206,20 +206,21 @@ class File(DatasetUnit):
206206 """
207207
208208 def __init__ (
209- self , filename : str , content : str | bytes , cwes : list [CWE ], has_vuln : bool
209+ self , filepath : Path , content : str | bytes , cwes : list [CWE ], has_vuln : bool
210210 ) -> None :
211211 """Initialize a File instance.
212212
213213 Args:
214- filename : The name of the file.
214+ filepath : The relative path of the file.
215215 content: The content of the file, as a string or bytes. It will be
216216 converted to bytes if provided as a string.
217217 cwes: A list of CWEs associated with the file.
218218 has_vuln: True if the vulnerability is real, False if it's
219219 intended to be a false positive test case.
220220
221221 """
222- self .filename = filename
222+ self .filepath = filepath
223+ self .filename = self .filepath .name
223224 self .content = content
224225 self .cwes = cwes
225226 self .has_vuln = has_vuln
@@ -231,29 +232,29 @@ def __repr__(self) -> str:
231232 """Return a developer-friendly string representation of the File.
232233
233234 Returns:
234- A string showing the class name, filename , and CWE IDs.
235+ A string showing the class name, filepath , and CWE IDs.
235236
236237 """
237238 return f"""{ self .__class__ .__name__ } (
238- filename : \t { self .filename }
239+ filepath : \t { self .filepath }
239240 cwes: \t { self .cwes }
240241)"""
241242
def __eq__(self, other: str | Path | Self) -> bool:
    """Compare this File with another object for equality based on filepath.

    Args:
        other: The object to compare with. Can be a string or Path
            (interpreted as a filepath) or another File instance.

    Returns:
        True if the filepaths are equal, False otherwise. Objects of any
        other type never compare equal.

    """
    # NOTE(review): a class that defines __eq__ without __hash__ becomes
    # unhashable; confirm whether File instances are ever used in sets or
    # as dict keys (the rest of the class is not visible here).
    if isinstance(other, (str, Path)):
        # Wrap in Path so that "a/b.py" and Path("a/b.py") compare equal.
        return self.filepath == Path(other)
    if isinstance(other, self.__class__):
        return self.filepath == other.filepath
    return False
259260
@@ -264,7 +265,9 @@ def save(self, dir: Path) -> None:
264265 dir: The path to the directory where the file should be saved.
265266
266267 """
267- (dir / self .filename ).write_bytes (self .content )
268+ target_path = dir / self .filepath
269+ target_path .parent .mkdir (parents = True , exist_ok = True )
270+ target_path .write_bytes (self .content )
268271
269272
270273class FileDataset (Dataset ):
@@ -303,7 +306,7 @@ def validate(self, analysis_result: AnalysisResult) -> FileDatasetData:
303306 """
304307 # 1. Prepare ground truth from all files in the dataset
305308 ground_truth : dict [str , tuple [bool , set [CWE ]]] = {
306- file .filename : (file .has_vuln , set (file .cwes )) for file in self .files
309+ str ( file .filepath ) : (file .has_vuln , set (file .cwes )) for file in self .files
307310 }
308311
309312 # 2. Process reported defects to get unique (file, cwe) pairs
@@ -313,32 +316,32 @@ def validate(self, analysis_result: AnalysisResult) -> FileDatasetData:
313316 if not defect .cwe or defect .cwe .id == - 1 :
314317 continue
315318
316- file_cwe_pair = (defect .filename , defect .cwe )
319+ file_cwe_pair = (str ( defect .filepath ) , defect .cwe )
317320 if file_cwe_pair not in unique_reported_defects :
318321 unique_reported_defects [file_cwe_pair ] = defect
319322
320323 # 3. Classify unique reported vulnerabilities as TP or FP
321324 tp_defects_map : dict [tuple [str , CWE ], Defect ] = {}
322325 fp_defects_map : dict [tuple [str , CWE ], Defect ] = {}
323326
324- for (filename , cwe ), defect in unique_reported_defects .items ():
325- has_vuln , expected_cwes = ground_truth .get (filename , (False , set ()))
327+ for (filepath , cwe ), defect in unique_reported_defects .items ():
328+ has_vuln , expected_cwes = ground_truth .get (filepath , (False , set ()))
326329
327330 if has_vuln and cwe in expected_cwes :
328331 # Correctly identified a vulnerability
329- tp_defects_map [(filename , cwe )] = defect
332+ tp_defects_map [(filepath , cwe )] = defect
330333 else :
331334 # Reported a vuln in a non-vulnerable file, with wrong CWE,
332335 # or in a file not part of the dataset.
333- fp_defects_map [(filename , cwe )] = defect
336+ fp_defects_map [(filepath , cwe )] = defect
334337
335338 # 4. Determine False Negatives by finding what was missed from the ground truth.
336339 fn_defects_set : set [tuple [str , CWE ]] = set ()
337- for filename , (has_vuln , expected_cwes ) in ground_truth .items ():
340+ for filepath , (has_vuln , expected_cwes ) in ground_truth .items ():
338341 if has_vuln :
339342 for expected_cwe in expected_cwes :
340- if (filename , expected_cwe ) not in tp_defects_map :
341- fn_defects_set .add ((filename , expected_cwe ))
343+ if (filepath , expected_cwe ) not in tp_defects_map :
344+ fn_defects_set .add ((filepath , expected_cwe ))
342345
343346 # 5. Convert maps and sets to lists of objects for downstream use
344347 tp_defects = list (tp_defects_map .values ())
@@ -354,7 +357,7 @@ def validate(self, analysis_result: AnalysisResult) -> FileDatasetData:
354357 fp_cwes = [cwe for _ , cwe in fp_defects_map .keys ()]
355358 fn_cwes = [cwe for _ , cwe in fn_defects_set ]
356359
357- unique_correct_number = len ({filename for filename , _ in tp_defects_map .keys ()})
360+ unique_correct_number = len ({filepath for filepath , _ in tp_defects_map .keys ()})
358361
359362 return FileDatasetData (
360363 dataset = self ,
0 commit comments