Commit 8d8406d

Merge pull request #51 from ai-forever/dev
v0.0.9
2 parents bf68dbf + 1822b22

210 files changed, 13727 additions & 5707 deletions

Note: this is a large commit, so only part of the diff is shown below.

.github/workflows/code-quality.yml

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
name: Code Quality
on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: "3.9"
      - run: pip install --upgrade pip
      - run: pip install Cython
      - run: pip install .[dev,filters]
      - run: mypy DPF/
      - run: ruff check DPF/ scripts/ tests/ --config pyproject.toml
      - run: isort --check DPF/ tests/ scripts/
@@ -1,23 +1,23 @@
-name: Pylint
-
+name: Pytest
 on: [push]

 jobs:
   build:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.9", "3.11"]
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install pylint
-      - name: Analysing the code with pylint
+          pip install Cython
+          pip install .[dev,filters]
+      - name: Running tests with pytest
         run: |
-          pylint $(git ls-files '*.py')
+          python -m pytest

.gitignore

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 #
+/weights/
 data/
-tests/
-pipelines/
+cache_dir/

 # pycharm
 .idea/

DPF/__init__.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
"""DPF framework"""

__version__ = "1.0.0"

from .configs import (
    DatasetConfig,
    FilesDatasetConfig,
    ShardedFilesDatasetConfig,
    ShardsDatasetConfig,
)
from .connectors import Connector, LocalConnector, S3Connector
from .dataset_reader import DatasetReader
from .processors import (
    DatasetProcessor,
    FilesDatasetProcessor,
    ShardedFilesDatasetProcessor,
    ShardsDatasetProcessor,
)
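
For context, a minimal sketch of how this new top-level API might be wired together. The metadata path and column names are illustrative, and `DatasetReader.read_from_config` is an assumption about the reader's interface, since this diff only shows the re-exports:

# A hedged usage sketch; file paths, column names, and read_from_config
# are assumptions, not part of this diff.
from DPF import DatasetReader, FilesDatasetConfig

config = FilesDatasetConfig.from_path_and_columns(
    "data/dataset.csv",           # hypothetical metadata file
    image_path_col="image_path",  # hypothetical column name
    text_col="caption",           # hypothetical column name
)
reader = DatasetReader()
processor = reader.read_from_config(config)  # assumed DatasetReader method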

DPF/configs/__init__.py

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
from .dataset_config import DatasetConfig
from .files_config import FilesDatasetConfig
from .sharded_config import ShardedDatasetConfig
from .sharded_files_config import ShardedFilesDatasetConfig
from .shards_config import ShardsDatasetConfig


def config2format(config: DatasetConfig) -> str:
    if isinstance(config, ShardsDatasetConfig):
        return "shards"
    elif isinstance(config, ShardedFilesDatasetConfig):
        return "sharded_files"
    elif isinstance(config, FilesDatasetConfig):
        return "files"
    else:
        raise ValueError(f"Unknown config type: {config}")
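
A short sketch of the `config2format` dispatch, using only names defined or imported in this file; the metadata path and column name are illustrative:

from DPF.configs import FilesDatasetConfig, config2format

# FilesDatasetConfig resolves to "files"; any DatasetConfig outside the
# three handled branches raises ValueError.
config = FilesDatasetConfig.from_path_and_columns(
    "data/dataset.csv", image_path_col="image_path"  # hypothetical names
)
assert config2format(config) == "files"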

DPF/configs/dataset_config.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
from abc import ABC, abstractmethod

from DPF.datatypes import DataType
from DPF.modalities import ModalityName


class DatasetConfig(ABC):
    """Config for a dataset"""

    def __init__(self, path: str):
        assert not path.endswith('/')
        self.path = path

    @property
    @abstractmethod
    def datatypes(self) -> list[DataType]:
        """List of datatypes of a dataset"""
        pass

    @property
    @abstractmethod
    def modality2datatype(self) -> dict[ModalityName, DataType]:
        """Mapping modality to its datatype"""
        pass

    @property
    @abstractmethod
    def user_column2default_column(self) -> dict[str, str]:
        pass

    @property
    def user_column_names(self) -> list[str]:
        return list(self.user_column2default_column.keys())

    @property
    def user_columns_to_rename(self) -> dict[str, str]:
        columns_to_rename = {}
        for k, v in self.user_column2default_column.items():
            if k != v:
                columns_to_rename[k] = v
        return columns_to_rename
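
To make the renaming contract concrete: `user_columns_to_rename` keeps only the user columns whose names differ from the framework defaults. A sketch using the concrete `FilesDatasetConfig` added later in this commit; the default column name "image_path" is an assumption about the image modality, which is not shown in this diff:

from DPF.configs import FilesDatasetConfig

# "img" differs from the modality's assumed default column "image_path",
# so it is scheduled for renaming; a column already named "image_path"
# would be left out of user_columns_to_rename.
config = FilesDatasetConfig.from_path_and_columns(
    "data/dataset.csv", image_path_col="img"  # hypothetical names
)
print(config.user_column_names)       # ['img']
print(config.user_columns_to_rename)  # {'img': 'image_path'} (default assumed)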

DPF/configs/files_config.py

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
import os
from typing import Optional, Union

from DPF.datatypes import ColumnDataType, DataType, FileDataType
from DPF.modalities import MODALITIES, ModalityName

from .dataset_config import DatasetConfig


class FilesDatasetConfig(DatasetConfig):
    """Config for Files dataset type"""

    def __init__(
        self,
        path: str,
        datatypes: list[Union[FileDataType, ColumnDataType]],
    ):
        """
        Parameters
        ----------
        path: str
            Path to dataset metadata file
        datatypes: list[Union[FileDataType, ColumnDataType]]
            List of datatypes in dataset
        """
        super().__init__(path)
        self.table_path = path
        self.base_path = os.path.dirname(self.table_path)
        self._datatypes = datatypes
        self._modality2datatype = {d.modality.name: d for d in datatypes}

        assert len({d.modality.name for d in datatypes}) == len(datatypes), \
            "More than one datatype with same modality is not supported"
        for data in self.datatypes:
            assert isinstance(data, (ColumnDataType, FileDataType))

    @property
    def datatypes(self) -> list[DataType]:
        return self._datatypes  # type: ignore

    @property
    def modality2datatype(self) -> dict[ModalityName, DataType]:
        return self._modality2datatype  # type: ignore

    @property
    def user_column2default_column(self) -> dict[str, str]:
        mapping = {}
        for data in self.datatypes:
            if isinstance(data, ColumnDataType):
                mapping[data.user_column_name] = data.column_name
            elif isinstance(data, FileDataType):
                mapping[data.user_path_column_name] = data.modality.path_column
        return mapping

    @classmethod
    def from_path_and_columns(
        cls,
        path: str,
        image_path_col: Optional[str] = None,
        video_path_col: Optional[str] = None,
        text_col: Optional[str] = None,
    ) -> "FilesDatasetConfig":
        """
        Parameters
        ----------
        path: str
            Path to dataset metadata file
        image_path_col: Optional[str] = None
            Name of column with image paths
        video_path_col: Optional[str] = None
            Name of column with video paths
        text_col: Optional[str] = None
            Name of column with text

        Returns
        -------
        FilesDatasetConfig
            Instance of itself
        """
        datatypes: list[Union[FileDataType, ColumnDataType]] = []
        if image_path_col:
            datatypes.append(FileDataType(MODALITIES['image'], image_path_col))
        if video_path_col:
            datatypes.append(FileDataType(MODALITIES['video'], video_path_col))
        if text_col:
            datatypes.append(ColumnDataType(MODALITIES['text'], text_col))
        assert len(datatypes) > 0, "At least one modality should be provided"
        return cls(path, datatypes)

    def __repr__(self) -> str:
        s = "FilesDatasetConfig(\n\t"
        s += f'table_path="{self.table_path}",\n\t'
        s += 'datatypes=[\n\t\t'
        s += '\n\t\t'.join([str(i) for i in self.datatypes])
        s += '\n\t]'
        s += '\n)'
        return s
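
A hedged example of the column mapping this config builds for a mixed image-text table. The default names "image_path" and "text" are assumptions about the modality definitions, which this diff does not include:

from DPF.configs import FilesDatasetConfig

config = FilesDatasetConfig.from_path_and_columns(
    "data/dataset.csv",        # hypothetical metadata file
    image_path_col="img_path", # hypothetical column name
    text_col="caption",        # hypothetical column name
)
# FileDataType maps user_path_column_name -> modality.path_column;
# ColumnDataType maps user_column_name -> column_name.
print(config.user_column2default_column)
# e.g. {'img_path': 'image_path', 'caption': 'text'}  (defaults assumed)
print(config)  # multi-line repr with table_path and datatypes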

DPF/configs/sharded_config.py

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
from typing import Union

from DPF.datatypes import ColumnDataType, DataType, ShardedDataType

from ..modalities import ModalityName
from .dataset_config import DatasetConfig


class ShardedDatasetConfig(DatasetConfig):

    def __init__(
        self,
        path: str,
        datatypes: list[Union[ShardedDataType, ColumnDataType]],
        datafiles_ext: str = "csv",
    ):
        super().__init__(path)
        self._datatypes = datatypes
        self.datafiles_ext = datafiles_ext.lstrip('.')
        self._modality2datatype = {d.modality.name: d for d in datatypes}

        assert len({d.modality.name for d in datatypes}) == len(datatypes), \
            "More than one datatype with same modality is not supported"
        for data in self.datatypes:
            assert isinstance(data, (ColumnDataType, ShardedDataType))

    @property
    def datatypes(self) -> list[DataType]:
        return self._datatypes  # type: ignore

    @property
    def modality2datatype(self) -> dict[ModalityName, DataType]:
        return self._modality2datatype  # type: ignore

    @property
    def user_column2default_column(self) -> dict[str, str]:
        mapping = {}
        for data in self.datatypes:
            if isinstance(data, ColumnDataType):
                mapping[data.user_column_name] = data.column_name
            elif isinstance(data, ShardedDataType):
                mapping[data.user_basename_column_name] = data.modality.sharded_file_name_column
        return mapping

    def __repr__(self) -> str:
        s = "ShardedDatasetConfig(\n\t"
        s += f'path="{self.path}",\n\t'
        s += f'datafiles_ext="{self.datafiles_ext}",\n\t'
        s += 'datatypes=[\n\t\t'
        s += '\n\t\t'.join([str(i) for i in self.datatypes])
        s += '\n\t]'
        s += '\n)'
        return s
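
One behavior worth noting: the constructor normalizes `datafiles_ext` with `lstrip('.')`, so a leading dot is accepted. A sketch reusing the `ShardedDataType(modality, column)` call pattern from elsewhere in this commit; the directory and column names are illustrative:

from DPF.configs.sharded_config import ShardedDatasetConfig
from DPF.datatypes import ShardedDataType
from DPF.modalities import MODALITIES

# ".csv" and "csv" are equivalent after the lstrip('.') normalization.
config = ShardedDatasetConfig(
    "data/shards",  # hypothetical shard directory (no trailing slash)
    [ShardedDataType(MODALITIES['image'], "image_name")],
    datafiles_ext=".csv",
)
assert config.datafiles_ext == "csv"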

DPF/configs/sharded_files_config.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
from typing import Optional, Union

from DPF.datatypes import ColumnDataType, ShardedDataType
from DPF.modalities import MODALITIES

from .sharded_config import ShardedDatasetConfig


class ShardedFilesDatasetConfig(ShardedDatasetConfig):
    """Config for ShardedFiles dataset type"""

    def __init__(
        self,
        path: str,
        datatypes: list[Union[ShardedDataType, ColumnDataType]],
        datafiles_ext: str = "csv",
    ):
        """
        Parameters
        ----------
        path: str
            Path to directory with shards
        datatypes: list[Union[ShardedDataType, ColumnDataType]]
            List of datatypes in dataset
        datafiles_ext: str = "csv"
            Extension of files with metadata in shards
        """
        super().__init__(path, datatypes, datafiles_ext)

    @classmethod
    def from_path_and_columns(
        cls,
        path: str,
        image_name_col: Optional[str] = None,
        video_name_col: Optional[str] = None,
        text_col: Optional[str] = None,
        datafiles_ext: str = "csv",
    ) -> "ShardedFilesDatasetConfig":
        """
        Parameters
        ----------
        path: str
            Path to directory with shards
        image_name_col: Optional[str] = None
            Name of column with image filenames in shard
        video_name_col: Optional[str] = None
            Name of column with video filenames in shard
        text_col: Optional[str] = None
            Name of column with text
        datafiles_ext: str = "csv"
            Extension of files with metadata in shards

        Returns
        -------
        ShardedFilesDatasetConfig
            Instance of itself
        """
        datatypes: list[Union[ShardedDataType, ColumnDataType]] = []
        if image_name_col:
            datatypes.append(ShardedDataType(MODALITIES['image'], image_name_col))
        if video_name_col:
            datatypes.append(ShardedDataType(MODALITIES['video'], video_name_col))
        if text_col:
            datatypes.append(ColumnDataType(MODALITIES['text'], text_col))
        assert len(datatypes) > 0, "At least one modality should be provided"
        return cls(path, datatypes, datafiles_ext=datafiles_ext)
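
Finally, a hedged sketch of building this config. The shard layout described in the comment is an assumption about the ShardedFiles format, not something this diff specifies:

from DPF.configs import ShardedFilesDatasetConfig

# Assumed layout: data/shards/0/0.csv (and further shard folders) holding
# an "image_name" column plus the image files alongside each datafile.
config = ShardedFilesDatasetConfig.from_path_and_columns(
    "data/shards",                # hypothetical shard directory
    image_name_col="image_name",  # hypothetical column name
    text_col="caption",           # hypothetical column name
)
print(config.user_column2default_column)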
