Skip to content

Commit a15bc06

Browse files
committed
Feature Storage complete with tests, almost ready for V0.2
1 parent c73df13 commit a15bc06

7 files changed

Lines changed: 485 additions & 13 deletions

File tree

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ benchmark/dataset
55

66
*.exe
77
*.gl
8+
*.gd
89
*.zip
910
*.npz
1011
benchmark/*.csv
1112

12-
*_pycache__/
13+
*_pycache__/
14+
.pytest_cache/

graphzero/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from .graphzero import Graph, convert_csv_to_gl
1+
from .graphzero import Graph, convert_csv_to_gl, convert_csv_to_gd, DataType, FeatureStore
22

33
# Metadata
44
__version__ = "0.1.2"

src/FeatureStore.hpp

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#ifndef FEATURE_STORE_H
2+
#define FEATURE_STORE_H
3+
#include <cstddef>
#include <cstdint>
#include <span>
#include <stdexcept>
#include <string>
#include <string_view>

#include "MemoryMap.hpp"
7+
8+
// Element type tag stored in FeatureHeader::dtype.
// The numeric values are part of the file format, so they are spelled out
// explicitly rather than left to implicit enumeration.
enum class DataType : std::int32_t {
    INT32   = 0,  // exposed to Python as int32_t
    INT64   = 1,  // exposed to Python as int64_t
    FLOAT32 = 2,  // exposed to Python as float
    FLOAT64 = 3   // exposed to Python as double
};
14+
15+
#pragma pack(push,1) // to complier to use 1-byte alignment(no padding)
16+
struct FeatureHeader
17+
{
18+
char magic[8]; // 8bytes "GZDATA26"
19+
uint32_t flags; // 4bytes
20+
DataType dtype; // 4bytes
21+
uint64_t num_nodes; // 8bytes
22+
uint64_t feature_dim; // 8bytes
23+
}; // 32 bytes aligned
24+
#pragma pack(pop) // restore default compiler alignment
25+
26+
class FeatureStore {
27+
private:
28+
MemoryMap* fileMap;
29+
struct FeatureHeader header;
30+
char* data_ptr;
31+
public:
32+
int64_t num_nodes;
33+
int64_t feature_dim;
34+
std::string filename;
35+
36+
FeatureStore(const char* filename);
37+
~FeatureStore();
38+
39+
DataType get_dtype() const;
40+
char* get_data_ptr() const; // 1byte pointer
41+
42+
template <typename T>
43+
std::span<T> get_data(int64_t nodeId);
44+
};
45+
46+
/**
 * Opens `filename` through MemoryMap, copies the FeatureHeader from the start
 * of the mapped bytes and validates its magic value.
 *
 * @param filename path of the .gd file; must not be null.
 * @throws std::runtime_error if `filename` is null or the magic bytes are not
 *         "GZDATA26". Anything MemoryMap's constructor throws propagates
 *         unchanged.
 */
inline FeatureStore::FeatureStore(const char* filename){
    // Constructing std::string from a null pointer is undefined behaviour.
    if (filename == nullptr) {
        throw std::runtime_error("FeatureStore: filename must not be null.");
    }
    this->filename = std::string(filename);
    this->fileMap = new MemoryMap(filename);
    // NOTE(review): nothing here checks that the mapping holds at least
    // sizeof(FeatureHeader) bytes; add a size check if MemoryMap exposes one.
    this->header = reinterpret_cast<FeatureHeader*>(this->fileMap->get_data())[0];
    if (std::string_view(header.magic, 8) != "GZDATA26") {
        // The destructor does not run when a constructor throws, so the
        // mapping has to be released here to avoid leaking it.
        delete this->fileMap;
        this->fileMap = nullptr;
        throw std::runtime_error("Corrupted or invalid .gd file! Magic bytes mismatch.");
    }
    this->num_nodes = static_cast<int64_t>(this->header.num_nodes);
    this->feature_dim = static_cast<int64_t>(this->header.feature_dim);
    // Feature data starts immediately after the header.
    this->data_ptr = reinterpret_cast<char*>(this->fileMap->get_data()) + sizeof(FeatureHeader);
}
57+
58+
// Drops the pointer into the data buffer, then destroys the MemoryMap that
// was allocated in the constructor.
inline FeatureStore::~FeatureStore(){
    data_ptr = nullptr;
    delete fileMap;
}
62+
63+
// Returns the element type recorded in the file header.
inline DataType FeatureStore::get_dtype() const{
    return header.dtype;
}
66+
67+
inline char* FeatureStore::get_data_ptr() const {
68+
return this->data_ptr;
69+
}
70+
71+
template <typename T>
72+
inline std::span<T> FeatureStore::get_data(int64_t nodeId){
73+
if(nodeId >= num_nodes){
74+
throw std::runtime_error("NodeId is greater than number of nodes.");
75+
}
76+
int64_t offset = nodeId * feature_dim; // start of data of nodeId
77+
T* p = reinterpret_cast<T*>(this->data_ptr) + offset;
78+
return std::span<T>(p,this->feature_dim);
79+
}
80+
#endif

src/bindings.cpp

Lines changed: 156 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
#include <nanobind/stl/string.h>
66
#include "Graphzero.hpp"
77
#include "csrFilegen.hpp"
8+
#include "featureFilegen.hpp"
9+
#include "FeatureStore.hpp"
810
#include <vector>
911
namespace nb = nanobind;
1012

@@ -23,9 +25,9 @@ It holds the mmap / zero-copy memory.
2325
)doc",
2426
nb::arg("filename")
2527
)
26-
.def_rw("num_nodes",&Graphzero::num_nodes)
27-
.def_rw("num_edges",&Graphzero::num_edges)
28-
.def_rw("has_weights",&Graphzero::has_weights)
28+
.def_ro("num_nodes",&Graphzero::num_nodes)
29+
.def_ro("num_edges",&Graphzero::num_edges)
30+
.def_ro("has_weights",&Graphzero::has_weights)
2931

3032

3133
.def("get_degree", [](Graphzero &self, int64_t node_id) {
@@ -215,9 +217,159 @@ R"doc(Performs uniform random neighbour sampling for a node.
215217

216218
// convert csv to gl
217219
m.def("convert_csv_to_gl", &convert_csv,
218-
"Convert a CSV edge list to GraphZero binary format (.gl)",
220+
R"doc(Convert a CSV edge list to GraphZero binary format (.gl))doc",
219221
nb::arg("csv_path"),
220222
nb::arg("out_path"),
221223
nb::arg("directed") = false,
222224
nb::call_guard<nb::gil_scoped_release>());
225+
226+
// Feature store class
227+
228+
// Bind the DataType Enum so Python can pass it to the converter
229+
nb::enum_<DataType>(m, "DataType")
230+
.value("INT32", DataType::INT32)
231+
.value("INT64", DataType::INT64)
232+
.value("FLOAT32", DataType::FLOAT32)
233+
.value("FLOAT64", DataType::FLOAT64)
234+
.export_values();
235+
236+
nb::class_<FeatureStore>(m, "FeatureStore")
237+
.def(nb::init<const char*>(), // constructor
238+
R"doc(Data Class contains the Datafile and its relevant functions and methods.
239+
It holds the mmap / zero-copy memory.
240+
Args:
241+
filename (str): either absolute path or relative path (depends on the current working directory).
242+
Returns:
243+
FeatureStorage instance.
244+
)doc",
245+
nb::arg("filename")
246+
)
247+
.def_ro("num_nodes",&FeatureStore::num_nodes)
248+
.def_ro("feature_dim",&FeatureStore::feature_dim)
249+
.def("get_data",[](FeatureStore &self,int64_t nodeId) -> nb::object {
250+
switch(self.get_dtype()){
251+
case DataType::INT32: {
252+
auto data = self.get_data<int32_t>(nodeId);
253+
auto arr = nb::ndarray<nb::numpy, int32_t, nb::shape<1>>(
254+
const_cast<int32_t*>(data.data()), // pointer to data
255+
{ data.size() } // shape
256+
);
257+
return nb::cast(arr);
258+
}
259+
case DataType::INT64: {
260+
261+
auto data = self.get_data<int64_t>(nodeId);
262+
auto arr = nb::ndarray<nb::numpy, int64_t, nb::shape<1>>(
263+
const_cast<int64_t*>(data.data()), // pointer to data
264+
{ data.size() } // shape
265+
);
266+
return nb::cast(arr);
267+
}
268+
case DataType::FLOAT32: {
269+
auto data = self.get_data<float>(nodeId);
270+
auto arr = nb::ndarray<nb::numpy, float, nb::shape<1>>(
271+
const_cast<float*>(data.data()), // pointer to data
272+
{ data.size() } // shape
273+
);
274+
return nb::cast(arr);
275+
}
276+
case DataType::FLOAT64: {
277+
auto data = self.get_data<double>(nodeId);
278+
auto arr = nb::ndarray<nb::numpy, double, nb::shape<1>>(
279+
const_cast<double*>(data.data()), // pointer to data
280+
{ data.size() } // shape
281+
);
282+
return nb::cast(arr);
283+
}
284+
default: throw std::runtime_error("Unsupported data type");
285+
}
286+
287+
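// Intended to return a zero-copy view into the underlying data buffer; nb::keep_alive<0,1> below keeps the FeatureStore alive for as long as the returned array is. NOTE(review): confirm the return-value policy used by nb::cast does not copy.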
288+
},
289+
nb::keep_alive<0,1>(),
290+
R"doc(Returns the data of a node.
291+
Args:
292+
node_id (int)
293+
Returns:
294+
1-D numpy ndarray of data for nodeId.
295+
)doc",
296+
nb::arg("node_id")
297+
)
298+
.def("get_tensor",[](FeatureStore &self) -> nb::object {
299+
char* data = self.get_data_ptr();
300+
size_t n = self.num_nodes;
301+
size_t f = self.feature_dim;
302+
303+
switch(self.get_dtype()){
304+
case DataType::INT32: {
305+
int32_t* ptr = reinterpret_cast<int32_t*>(data);
306+
307+
auto arr = nb::ndarray<nb::numpy, int32_t, nb::shape<2>>(
308+
ptr, // pointer to data
309+
{n,f } // shape
310+
);
311+
return nb::cast(arr);
312+
}
313+
case DataType::INT64: {
314+
int64_t* ptr = reinterpret_cast<int64_t*>(data);
315+
316+
auto arr = nb::ndarray<nb::numpy, int64_t, nb::shape<2>>(
317+
ptr, // pointer to data
318+
{n,f } // shape
319+
);
320+
return nb::cast(arr);
321+
}
322+
case DataType::FLOAT32: {
323+
float* ptr = reinterpret_cast<float*>(data);
324+
325+
auto arr = nb::ndarray<nb::numpy, float, nb::shape<2>>(
326+
ptr, // pointer to data
327+
{n,f } // shape
328+
);
329+
return nb::cast(arr);
330+
}
331+
case DataType::FLOAT64: {
332+
double* ptr = reinterpret_cast<double*>(data);
333+
334+
auto arr = nb::ndarray<nb::numpy, double, nb::shape<2>>(
335+
ptr, // pointer to data
336+
{n,f } // shape
337+
);
338+
return nb::cast(arr);
339+
}
340+
default: throw std::runtime_error("Unsupported data type");
341+
}
342+
},
343+
nb::keep_alive<0,1>(),
344+
R"doc(Returns the entire Data (tensor).
345+
Returns:
346+
NxF data of given dtype.
347+
)doc"
348+
)
349+
350+
// serialization (Pack)
351+
.def("__getstate__", [](const FeatureStore &d){
352+
353+
return nb::make_tuple(d.filename); // only filename required to rebuild the object
354+
})
355+
// deserialization (unpack)
356+
.def("__setstate__",[](nb::tuple &t){
357+
358+
if (t.size() != 1)
359+
throw std::runtime_error("Invalid state!");
360+
361+
std::string filename = nb::cast<std::string>(t[0]);
362+
363+
// create new c++ object using the filename
364+
return new FeatureStore(filename.c_str());
365+
})
366+
;
367+
368+
//feature store data
369+
m.def("convert_csv_to_gd", &convert_csv_to_binary,
370+
R"doc(Convert a CSV data to GraphZero data format (.gd).)doc",
371+
nb::arg("csv_path"),
372+
nb::arg("out_path"),
373+
nb::arg("dtype"),
374+
nb::call_guard<nb::gil_scoped_release>());
223375
}

0 commit comments

Comments
 (0)