Commit 8d8406d

Merge pull request #51 from ai-forever/dev
v0.0.9
2 parents bf68dbf + 1822b22

210 files changed, 13727 additions & 5707 deletions

Note: this is a large commit, so only part of the diff is shown below.

.github/workflows/code-quality.yml

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
name: Code Quality
on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: "3.9"
      - run: pip install --upgrade pip
      - run: pip install Cython
      - run: pip install .[dev,filters]
      - run: mypy DPF/
      - run: ruff check DPF/ scripts/ tests/ --config pyproject.toml
      - run: isort --check DPF/ tests/ scripts/
@@ -1,23 +1,23 @@
-name: Pylint
-
+name: Pytest
 on: [push]

 jobs:
   build:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.9", "3.11"]
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install pylint
-      - name: Analysing the code with pylint
+          pip install Cython
+          pip install .[dev,filters]
+      - name: Running tests with pytest
         run: |
-          pylint $(git ls-files '*.py')
+          python -m pytest

.gitignore

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 #
+/weights/
 data/
-tests/
-pipelines/
+cache_dir/

 # pycharm
 .idea/

DPF/__init__.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
"""DPF framework"""

__version__ = "1.0.0"

from .configs import (
    DatasetConfig,
    FilesDatasetConfig,
    ShardedFilesDatasetConfig,
    ShardsDatasetConfig,
)
from .connectors import Connector, LocalConnector, S3Connector
from .dataset_reader import DatasetReader
from .processors import (
    DatasetProcessor,
    FilesDatasetProcessor,
    ShardedFilesDatasetProcessor,
    ShardsDatasetProcessor,
)
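
For context, a minimal sketch of how this new top-level API might be wired together. The metadata path and column names are illustrative, and `DatasetReader.read_from_config` is an assumption about the reader's interface, since this diff only shows the re-exports:

# A hedged usage sketch; file paths, column names, and read_from_config
# are assumptions, not part of this diff.
from DPF import DatasetReader, FilesDatasetConfig

config = FilesDatasetConfig.from_path_and_columns(
    "data/dataset.csv",           # hypothetical metadata file
    image_path_col="image_path",  # hypothetical column name
    text_col="caption",           # hypothetical column name
)
reader = DatasetReader()
processor = reader.read_from_config(config)  # assumed DatasetReader method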

DPF/configs/__init__.py

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
from .dataset_config import DatasetConfig
from .files_config import FilesDatasetConfig
from .sharded_config import ShardedDatasetConfig
from .sharded_files_config import ShardedFilesDatasetConfig
from .shards_config import ShardsDatasetConfig


def config2format(config: DatasetConfig) -> str:
    if isinstance(config, ShardsDatasetConfig):
        return "shards"
    elif isinstance(config, ShardedFilesDatasetConfig):
        return "sharded_files"
    elif isinstance(config, FilesDatasetConfig):
        return "files"
    else:
        raise ValueError(f"Unknown config type: {config}")
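
A short sketch of the `config2format` dispatch, using only names defined or imported in this file; the metadata path and column name are illustrative:

from DPF.configs import FilesDatasetConfig, config2format

# FilesDatasetConfig resolves to "files"; any DatasetConfig outside the
# three handled branches raises ValueError.
config = FilesDatasetConfig.from_path_and_columns(
    "data/dataset.csv", image_path_col="image_path"  # hypothetical names
)
assert config2format(config) == "files"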

DPF/configs/dataset_config.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
from abc import ABC, abstractmethod

from DPF.datatypes import DataType
from DPF.modalities import ModalityName


class DatasetConfig(ABC):
    """Config for a dataset"""

    def __init__(self, path: str):
        assert not path.endswith('/')
        self.path = path

    @property
    @abstractmethod
    def datatypes(self) -> list[DataType]:
        """List of datatypes of a dataset"""
        pass

    @property
    @abstractmethod
    def modality2datatype(self) -> dict[ModalityName, DataType]:
        """Mapping modality to its datatype"""
        pass

    @property
    @abstractmethod
    def user_column2default_column(self) -> dict[str, str]:
        pass

    @property
    def user_column_names(self) -> list[str]:
        return list(self.user_column2default_column.keys())

    @property
    def user_columns_to_rename(self) -> dict[str, str]:
        columns_to_rename = {}
        for k, v in self.user_column2default_column.items():
            if k != v:
                columns_to_rename[k] = v
        return columns_to_rename
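
To make the renaming contract concrete: `user_columns_to_rename` keeps only the user columns whose names differ from the framework defaults. A sketch using the concrete `FilesDatasetConfig` added later in this commit; the default column name "image_path" is an assumption about the image modality, which is not shown in this diff:

from DPF.configs import FilesDatasetConfig

# "img" differs from the modality's assumed default column "image_path",
# so it is scheduled for renaming; a column already named "image_path"
# would be left out of user_columns_to_rename.
config = FilesDatasetConfig.from_path_and_columns(
    "data/dataset.csv", image_path_col="img"  # hypothetical names
)
print(config.user_column_names)       # ['img']
print(config.user_columns_to_rename)  # {'img': 'image_path'} (default assumed)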

DPF/configs/files_config.py

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
import os
from typing import Optional, Union

from DPF.datatypes import ColumnDataType, DataType, FileDataType
from DPF.modalities import MODALITIES, ModalityName

from .dataset_config import DatasetConfig


class FilesDatasetConfig(DatasetConfig):
    """Config for Files dataset type"""

    def __init__(
        self,
        path: str,
        datatypes: list[Union[FileDataType, ColumnDataType]],
    ):
        """
        Parameters
        ----------
        path: str
            Path to dataset metadata file
        datatypes: list[Union[FileDataType, ColumnDataType]]
            List of datatypes in dataset
        """
        super().__init__(path)
        self.table_path = path
        self.base_path = os.path.dirname(self.table_path)
        self._datatypes = datatypes
        self._modality2datatype = {d.modality.name: d for d in datatypes}

        assert len({d.modality.name for d in datatypes}) == len(datatypes), \
            "More than one datatype with same modality is not supported"
        for data in self.datatypes:
            assert isinstance(data, (ColumnDataType, FileDataType))

    @property
    def datatypes(self) -> list[DataType]:
        return self._datatypes  # type: ignore

    @property
    def modality2datatype(self) -> dict[ModalityName, DataType]:
        return self._modality2datatype  # type: ignore

    @property
    def user_column2default_column(self) -> dict[str, str]:
        mapping = {}
        for data in self.datatypes:
            if isinstance(data, ColumnDataType):
                mapping[data.user_column_name] = data.column_name
            elif isinstance(data, FileDataType):
                mapping[data.user_path_column_name] = data.modality.path_column
        return mapping

    @classmethod
    def from_path_and_columns(
        cls,
        path: str,
        image_path_col: Optional[str] = None,
        video_path_col: Optional[str] = None,
        text_col: Optional[str] = None,
    ) -> "FilesDatasetConfig":
        """
        Parameters
        ----------
        path: str
            Path to dataset metadata file
        image_path_col: Optional[str] = None
            Name of column with image paths
        video_path_col: Optional[str] = None
            Name of column with video paths
        text_col: Optional[str] = None
            Name of column with text

        Returns
        -------
        FilesDatasetConfig
            Instance of itself
        """
        datatypes: list[Union[FileDataType, ColumnDataType]] = []
        if image_path_col:
            datatypes.append(FileDataType(MODALITIES['image'], image_path_col))
        if video_path_col:
            datatypes.append(FileDataType(MODALITIES['video'], video_path_col))
        if text_col:
            datatypes.append(ColumnDataType(MODALITIES['text'], text_col))
        assert len(datatypes) > 0, "At least one modality should be provided"
        return cls(path, datatypes)

    def __repr__(self) -> str:
        s = "FilesDatasetConfig(\n\t"
        s += f'table_path="{self.table_path}",\n\t'
        s += 'datatypes=[\n\t\t'
        s += '\n\t\t'.join([str(i) for i in self.datatypes])
        s += '\n\t]'
        s += '\n)'
        return s
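
A hedged example of the column mapping this config builds for a mixed image-text table. The default names "image_path" and "text" are assumptions about the modality definitions, which this diff does not include:

from DPF.configs import FilesDatasetConfig

config = FilesDatasetConfig.from_path_and_columns(
    "data/dataset.csv",        # hypothetical metadata file
    image_path_col="img_path", # hypothetical column name
    text_col="caption",        # hypothetical column name
)
# FileDataType maps user_path_column_name -> modality.path_column;
# ColumnDataType maps user_column_name -> column_name.
print(config.user_column2default_column)
# e.g. {'img_path': 'image_path', 'caption': 'text'}  (defaults assumed)
print(config)  # multi-line repr with table_path and datatypes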

DPF/configs/sharded_config.py

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
from typing import Union

from DPF.datatypes import ColumnDataType, DataType, ShardedDataType

from ..modalities import ModalityName
from .dataset_config import DatasetConfig


class ShardedDatasetConfig(DatasetConfig):

    def __init__(
        self,
        path: str,
        datatypes: list[Union[ShardedDataType, ColumnDataType]],
        datafiles_ext: str = "csv",
    ):
        super().__init__(path)
        self._datatypes = datatypes
        self.datafiles_ext = datafiles_ext.lstrip('.')
        self._modality2datatype = {d.modality.name: d for d in datatypes}

        assert len({d.modality.name for d in datatypes}) == len(datatypes), \
            "More than one datatype with same modality is not supported"
        for data in self.datatypes:
            assert isinstance(data, (ColumnDataType, ShardedDataType))

    @property
    def datatypes(self) -> list[DataType]:
        return self._datatypes  # type: ignore

    @property
    def modality2datatype(self) -> dict[ModalityName, DataType]:
        return self._modality2datatype  # type: ignore

    @property
    def user_column2default_column(self) -> dict[str, str]:
        mapping = {}
        for data in self.datatypes:
            if isinstance(data, ColumnDataType):
                mapping[data.user_column_name] = data.column_name
            elif isinstance(data, ShardedDataType):
                mapping[data.user_basename_column_name] = data.modality.sharded_file_name_column
        return mapping

    def __repr__(self) -> str:
        s = "ShardedDatasetConfig(\n\t"
        s += f'path="{self.path}",\n\t'
        s += f'datafiles_ext="{self.datafiles_ext}",\n\t'
        s += 'datatypes=[\n\t\t'
        s += '\n\t\t'.join([str(i) for i in self.datatypes])
        s += '\n\t]'
        s += '\n)'
        return s
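
One behavior worth noting: the constructor normalizes `datafiles_ext` with `lstrip('.')`, so a leading dot is accepted. A sketch reusing the `ShardedDataType(modality, column)` call pattern from elsewhere in this commit; the directory and column names are illustrative:

from DPF.configs.sharded_config import ShardedDatasetConfig
from DPF.datatypes import ShardedDataType
from DPF.modalities import MODALITIES

# ".csv" and "csv" are equivalent after the lstrip('.') normalization.
config = ShardedDatasetConfig(
    "data/shards",  # hypothetical shard directory (no trailing slash)
    [ShardedDataType(MODALITIES['image'], "image_name")],
    datafiles_ext=".csv",
)
assert config.datafiles_ext == "csv"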

DPF/configs/sharded_files_config.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
from typing import Optional, Union

from DPF.datatypes import ColumnDataType, ShardedDataType
from DPF.modalities import MODALITIES

from .sharded_config import ShardedDatasetConfig


class ShardedFilesDatasetConfig(ShardedDatasetConfig):
    """Config for ShardedFiles dataset type"""

    def __init__(
        self,
        path: str,
        datatypes: list[Union[ShardedDataType, ColumnDataType]],
        datafiles_ext: str = "csv",
    ):
        """
        Parameters
        ----------
        path: str
            Path to directory with shards
        datatypes: list[Union[ShardedDataType, ColumnDataType]]
            List of datatypes in dataset
        datafiles_ext: str = "csv"
            Extension of files with metadata in shards
        """
        super().__init__(path, datatypes, datafiles_ext)

    @classmethod
    def from_path_and_columns(
        cls,
        path: str,
        image_name_col: Optional[str] = None,
        video_name_col: Optional[str] = None,
        text_col: Optional[str] = None,
        datafiles_ext: str = "csv",
    ) -> "ShardedFilesDatasetConfig":
        """
        Parameters
        ----------
        path: str
            Path to directory with shards
        image_name_col: Optional[str] = None
            Name of column with image filenames in shard
        video_name_col: Optional[str] = None
            Name of column with video filenames in shard
        text_col: Optional[str] = None
            Name of column with text
        datafiles_ext: str = "csv"
            Extension of files with metadata in shards

        Returns
        -------
        ShardedFilesDatasetConfig
            Instance of itself
        """
        datatypes: list[Union[ShardedDataType, ColumnDataType]] = []
        if image_name_col:
            datatypes.append(ShardedDataType(MODALITIES['image'], image_name_col))
        if video_name_col:
            datatypes.append(ShardedDataType(MODALITIES['video'], video_name_col))
        if text_col:
            datatypes.append(ColumnDataType(MODALITIES['text'], text_col))
        assert len(datatypes) > 0, "At least one modality should be provided"
        return cls(path, datatypes, datafiles_ext=datafiles_ext)
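
Finally, a hedged sketch of building this config. The shard layout described in the comment is an assumption about the ShardedFiles format, not something this diff specifies:

from DPF.configs import ShardedFilesDatasetConfig

# Assumed layout: data/shards/0/0.csv (and further shard folders) holding
# an "image_name" column plus the image files alongside each datafile.
config = ShardedFilesDatasetConfig.from_path_and_columns(
    "data/shards",                # hypothetical shard directory
    image_name_col="image_name",  # hypothetical column name
    text_col="caption",           # hypothetical column name
)
print(config.user_column2default_column)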
