Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 28 additions & 56 deletions _duckdb-stubs/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -309,22 +309,15 @@ class DuckDBPyConnection:
strict_mode: bool | None = None,
) -> DuckDBPyRelation: ...
def from_df(self, df: pandas.DataFrame) -> DuckDBPyRelation: ...
@typing.overload
def from_parquet(
self,
file_glob: str,
binary_as_string: bool = False,
*,
file_row_number: bool = False,
filename: bool = False,
hive_partitioning: bool = False,
union_by_name: bool = False,
compression: ParquetCompression | None = None,
) -> DuckDBPyRelation: ...
@typing.overload
def from_parquet(
self,
file_globs: Sequence[str],
path_or_buffer: str
| bytes
| os.PathLike[str]
| os.PathLike[bytes]
| typing.IO[bytes]
| typing.IO[str]
| Sequence[str | bytes | os.PathLike[str] | os.PathLike[bytes] | typing.IO[bytes] | typing.IO[str]],
binary_as_string: bool = False,
*,
file_row_number: bool = False,
Expand Down Expand Up @@ -433,22 +426,15 @@ class DuckDBPyConnection:
hive_types: HiveTypes | None = None,
hive_types_autocast: bool | None = None,
) -> DuckDBPyRelation: ...
@typing.overload
def read_parquet(
self,
file_glob: str,
binary_as_string: bool = False,
*,
file_row_number: bool = False,
filename: bool = False,
hive_partitioning: bool = False,
union_by_name: bool = False,
compression: ParquetCompression | None = None,
) -> DuckDBPyRelation: ...
@typing.overload
def read_parquet(
self,
file_globs: Sequence[str],
path_or_buffer: str
| bytes
| os.PathLike[str]
| os.PathLike[bytes]
| typing.IO[bytes]
| typing.IO[str]
| Sequence[str | bytes | os.PathLike[str] | os.PathLike[bytes] | typing.IO[bytes] | typing.IO[str]],
binary_as_string: bool = False,
*,
file_row_number: bool = False,
Expand Down Expand Up @@ -1061,21 +1047,14 @@ def from_csv_auto(
strict_mode: bool | None = None,
) -> DuckDBPyRelation: ...
def from_df(df: pandas.DataFrame, *, connection: DuckDBPyConnection | None = None) -> DuckDBPyRelation: ...
@typing.overload
def from_parquet(
file_glob: str,
binary_as_string: bool = False,
*,
file_row_number: bool = False,
filename: bool = False,
hive_partitioning: bool = False,
union_by_name: bool = False,
compression: ParquetCompression | None = None,
connection: DuckDBPyConnection | None = None,
) -> DuckDBPyRelation: ...
@typing.overload
def from_parquet(
file_globs: Sequence[str],
path_or_buffer: str
| bytes
| os.PathLike[str]
| os.PathLike[bytes]
| typing.IO[bytes]
| typing.IO[str]
| Sequence[str | bytes | os.PathLike[str] | os.PathLike[bytes] | typing.IO[bytes] | typing.IO[str]],
binary_as_string: bool = False,
*,
file_row_number: bool = False,
Expand Down Expand Up @@ -1232,21 +1211,14 @@ def read_json(
hive_types: HiveTypes | None = None,
hive_types_autocast: bool | None = None,
) -> DuckDBPyRelation: ...
@typing.overload
def read_parquet(
file_glob: str,
binary_as_string: bool = False,
*,
file_row_number: bool = False,
filename: bool = False,
hive_partitioning: bool = False,
union_by_name: bool = False,
compression: ParquetCompression | None = None,
connection: DuckDBPyConnection | None = None,
) -> DuckDBPyRelation: ...
@typing.overload
def read_parquet(
file_globs: Sequence[str],
path_or_buffer: str
| bytes
| os.PathLike[str]
| os.PathLike[bytes]
| typing.IO[bytes]
| typing.IO[str]
| Sequence[str | bytes | os.PathLike[str] | os.PathLike[bytes] | typing.IO[bytes] | typing.IO[str]],
binary_as_string: bool = False,
*,
file_row_number: bool = False,
Expand Down
53 changes: 3 additions & 50 deletions scripts/connection_methods.json
Original file line number Diff line number Diff line change
Expand Up @@ -941,11 +941,11 @@
"read_parquet"
],
"function": "FromParquet",
"docs": "Create a relation object from the Parquet files in file_glob",
"docs": "Create a relation object from the Parquet path(s) or file-like object(s) in 'path_or_buffer'",
"args": [
{
"name": "file_glob",
"type": "str"
"name": "path_or_buffer",
"type": "Union[str, bytes, os.PathLike, IO[bytes], IO[str], Sequence[Union[str, bytes, os.PathLike, IO[bytes], IO[str]]]]"
},
{
"name": "binary_as_string",
Expand Down Expand Up @@ -982,53 +982,6 @@
],
"return": "DuckDBPyRelation"
},
{
"name": [
"from_parquet",
"read_parquet"
],
"function": "FromParquets",
"docs": "Create a relation object from the Parquet files in file_globs",
"args": [
{
"name": "file_globs",
"type": "List[str]"
},
{
"name": "binary_as_string",
"default": "False",
"type": "bool"
}
],
"kwargs": [
{
"name": "file_row_number",
"default": "False",
"type": "bool"
},
{
"name": "filename",
"default": "False",
"type": "bool"
},
{
"name": "hive_partitioning",
"default": "False",
"type": "bool"
},
{
"name": "union_by_name",
"default": "False",
"type": "bool"
},
{
"name": "compression",
"default": "None",
"type": "str"
}
],
"return": "DuckDBPyRelation"
},
{
"name": "get_table_names",
"function": "GetTableNames",
Expand Down
58 changes: 14 additions & 44 deletions src/duckdb_py/duckdb_python.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -748,64 +748,34 @@ static void InitializeConnectionMethods(py::module_ &m) {
py::arg("connection") = py::none());
m.def(
"from_parquet",
[](const string &file_glob, bool binary_as_string, bool file_row_number, bool filename, bool hive_partitioning,
bool union_by_name, const py::object &compression = py::none(),
shared_ptr<DuckDBPyConnection> conn = nullptr) {
if (!conn) {
conn = DuckDBPyConnection::DefaultConnection();
}
return conn->FromParquet(file_glob, binary_as_string, file_row_number, filename, hive_partitioning,
union_by_name, compression);
},
"Create a relation object from the Parquet files in file_glob", py::arg("file_glob"),
py::arg("binary_as_string") = false, py::kw_only(), py::arg("file_row_number") = false,
py::arg("filename") = false, py::arg("hive_partitioning") = false, py::arg("union_by_name") = false,
py::arg("compression") = py::none(), py::arg("connection") = py::none());
m.def(
"read_parquet",
[](const string &file_glob, bool binary_as_string, bool file_row_number, bool filename, bool hive_partitioning,
bool union_by_name, const py::object &compression = py::none(),
shared_ptr<DuckDBPyConnection> conn = nullptr) {
if (!conn) {
conn = DuckDBPyConnection::DefaultConnection();
}
return conn->FromParquet(file_glob, binary_as_string, file_row_number, filename, hive_partitioning,
union_by_name, compression);
},
"Create a relation object from the Parquet files in file_glob", py::arg("file_glob"),
py::arg("binary_as_string") = false, py::kw_only(), py::arg("file_row_number") = false,
py::arg("filename") = false, py::arg("hive_partitioning") = false, py::arg("union_by_name") = false,
py::arg("compression") = py::none(), py::arg("connection") = py::none());
m.def(
"from_parquet",
[](const vector<string> &file_globs, bool binary_as_string, bool file_row_number, bool filename,
[](const py::object &path_or_buffer, bool binary_as_string, bool file_row_number, bool filename,
bool hive_partitioning, bool union_by_name, const py::object &compression = py::none(),
shared_ptr<DuckDBPyConnection> conn = nullptr) {
if (!conn) {
conn = DuckDBPyConnection::DefaultConnection();
}
return conn->FromParquets(file_globs, binary_as_string, file_row_number, filename, hive_partitioning,
union_by_name, compression);
return conn->FromParquet(path_or_buffer, binary_as_string, file_row_number, filename, hive_partitioning,
union_by_name, compression);
},
"Create a relation object from the Parquet files in file_globs", py::arg("file_globs"),
py::arg("binary_as_string") = false, py::kw_only(), py::arg("file_row_number") = false,
py::arg("filename") = false, py::arg("hive_partitioning") = false, py::arg("union_by_name") = false,
py::arg("compression") = py::none(), py::arg("connection") = py::none());
"Create a relation object from the Parquet path(s) or file-like object(s) in 'path_or_buffer'",
py::arg("path_or_buffer"), py::arg("binary_as_string") = false, py::kw_only(),
py::arg("file_row_number") = false, py::arg("filename") = false, py::arg("hive_partitioning") = false,
py::arg("union_by_name") = false, py::arg("compression") = py::none(), py::arg("connection") = py::none());
m.def(
"read_parquet",
[](const vector<string> &file_globs, bool binary_as_string, bool file_row_number, bool filename,
[](const py::object &path_or_buffer, bool binary_as_string, bool file_row_number, bool filename,
bool hive_partitioning, bool union_by_name, const py::object &compression = py::none(),
shared_ptr<DuckDBPyConnection> conn = nullptr) {
if (!conn) {
conn = DuckDBPyConnection::DefaultConnection();
}
return conn->FromParquets(file_globs, binary_as_string, file_row_number, filename, hive_partitioning,
union_by_name, compression);
return conn->FromParquet(path_or_buffer, binary_as_string, file_row_number, filename, hive_partitioning,
union_by_name, compression);
},
"Create a relation object from the Parquet files in file_globs", py::arg("file_globs"),
py::arg("binary_as_string") = false, py::kw_only(), py::arg("file_row_number") = false,
py::arg("filename") = false, py::arg("hive_partitioning") = false, py::arg("union_by_name") = false,
py::arg("compression") = py::none(), py::arg("connection") = py::none());
"Create a relation object from the Parquet path(s) or file-like object(s) in 'path_or_buffer'",
py::arg("path_or_buffer"), py::arg("binary_as_string") = false, py::kw_only(),
py::arg("file_row_number") = false, py::arg("filename") = false, py::arg("hive_partitioning") = false,
py::arg("union_by_name") = false, py::arg("compression") = py::none(), py::arg("connection") = py::none());
m.def(
"get_table_names",
[](const string &query, bool qualified, shared_ptr<DuckDBPyConnection> conn = nullptr) {
Expand Down
13 changes: 3 additions & 10 deletions src/duckdb_py/include/duckdb_python/pyconnection/pyconnection.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,16 +293,9 @@ struct DuckDBPyConnection : public enable_shared_from_this<DuckDBPyConnection> {

unique_ptr<DuckDBPyRelation> FromDF(const PandasDataFrame &value);

unique_ptr<DuckDBPyRelation> FromParquet(const string &file_glob, bool binary_as_string, bool file_row_number,
bool filename, bool hive_partitioning, bool union_by_name,
const py::object &compression = py::none());
unique_ptr<DuckDBPyRelation> FromParquets(const vector<string> &file_globs, bool binary_as_string,
bool file_row_number, bool filename, bool hive_partitioning,
bool union_by_name, const py::object &compression = py::none());

unique_ptr<DuckDBPyRelation> FromParquetInternal(Value &&file_param, bool binary_as_string, bool file_row_number,
bool filename, bool hive_partitioning, bool union_by_name,
const py::object &compression = py::none());
unique_ptr<DuckDBPyRelation> FromParquet(const py::object &path_or_buffer, bool binary_as_string,
bool file_row_number, bool filename, bool hive_partitioning,
bool union_by_name, const py::object &compression = py::none());

unique_ptr<DuckDBPyRelation> FromArrow(py::object &arrow_object);

Expand Down
14 changes: 6 additions & 8 deletions src/duckdb_py/path_like.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ namespace duckdb {

struct PathLikeProcessor {
public:
PathLikeProcessor(DuckDBPyConnection &connection, PythonImportCache &import_cache)
: connection(connection), import_cache(import_cache) {
explicit PathLikeProcessor(DuckDBPyConnection &connection) : connection(connection) {
}

public:
Expand All @@ -29,7 +28,6 @@ struct PathLikeProcessor {
public:
DuckDBPyConnection &connection;
optional_ptr<ModifiedMemoryFileSystem> object_store;
PythonImportCache &import_cache;
// The list containing every file
vector<string> all_files;
// The list of files that are registered in the object_store;
Expand All @@ -41,8 +39,10 @@ void PathLikeProcessor::AddFile(const py::object &object) {
all_files.push_back(std::string(py::str(object)));
return;
}
if (py::isinstance(object, import_cache.pathlib.Path())) {
all_files.push_back(std::string(py::str(object)));
if (py::isinstance<py::bytes>(object) || py::hasattr(object, "__fspath__")) {
// A bytes path or an os.PathLike object (e.g. pathlib.Path) - decode it to a string
auto fsdecode = py::module_::import("os").attr("fsdecode");
all_files.push_back(std::string(py::str(fsdecode(object))));
return;
}
// This is (assumed to be) a file-like object
Expand Down Expand Up @@ -79,9 +79,7 @@ PathLike PathLikeProcessor::Finalize() {
}

PathLike PathLike::Create(const py::object &object, DuckDBPyConnection &connection) {
auto &import_cache = *DuckDBPyConnection::ImportCache();

PathLikeProcessor processor(connection, import_cache);
PathLikeProcessor processor(connection);
if (py::isinstance<py::list>(object)) {
auto list = py::list(object);
for (auto &item : list) {
Expand Down
Loading
Loading