Skip to content

Commit c13b09b

Browse files
authored
Speed up the "dump" phase (Pass 2) of osm2rdf (#122)
Speed up the "dump" phase (Pass 2) of `osm2rdf` via a number of improvements: 1. Move `osmium::Buffer` objects directly into dedicated worker threads instead of copying them, and operate on the original `libosmium` objects the entire time. Besides saving the copy, this also avoids synchronization overhead (previously, each OSM object was a dedicated task). This was non-trivial to achieve, as several `libosmium` handlers were previously applied one after another to a finished `osmium::Buffer` object, augmenting the objects in a pipeline-like fashion until they were finally passed to the handler that dispatched them to the threads. Several of these handlers assumed that the objects arrive ordered by their OSM id, which is no longer guaranteed when buffers are handled in parallel. We solve this by moving these handlers into a separate internal pass. 2. Replace our internal C-API calls to `zlib` with calls to `zlib-ng`. 3. Improve the handling and formatting of attributes. In particular, avoid copies by operating directly on the raw C-strings from the `osmium::Buffer`. Also drop the use of `strftime` and generate the time strings directly. 4. Cherry-pick the smaller spatialjoin IDs proposed in #108 to avoid full IRI writes to disk. 5. Update `libspatialjoin`, which now folds the ID directly into the sweeper event if it fits into 64 bits (which it does for every OSM node), eliminating the writing and lookup of the ID for points. In total, the changes above **sped up the dump phase for `switzerland-latest.osm.pbf` by a factor of almost 5** (4 min -> 50 s) when using `--output-compression gz` and otherwise the default options. On the side, improve the progress bar by "weighting" individual tasks by their estimated duration. In particular, each way is now weighted by the number of its member nodes.
1 parent a9b2995 commit c13b09b

49 files changed

Lines changed: 1265 additions & 2142 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,6 @@
1414
[submodule "vendor/spatialjoin"]
1515
path = vendor/spatialjoin
1616
url = https://github.com/ad-freiburg/spatialjoin.git
17+
[submodule "vendor/zlib-ng/zlib-ng"]
18+
path = vendor/zlib-ng/zlib-ng
19+
url = https://github.com/zlib-ng/zlib-ng.git

CMakeLists.txt

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,18 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
4242

4343
# set(CMAKE_VERBOSE_MAKEFILE ON)
4444

45-
option(ENABLE_GEOMETRY_STATISTIC "Write geometry statistics if enable" 0)
45+
option(ENABLE_GEOMETRY_STATISTIC "Write geometry statistics if enabled" 0)
46+
47+
# Force static libraries
48+
set(BUILD_SHARED_LIBS OFF)
4649

4750
# Enable verbose makefile
4851
if (ENABLE_GEOMETRY_STATISTIC)
4952
# Enable geometry statistics
5053
add_definitions(-DENABLE_GEOMETRY_STATISTIC)
5154
endif()
5255

53-
add_compile_options(-Wall -Wextra -Wno-missing-field-initializers)
56+
add_compile_options(-Wall -Wextra -Wno-missing-field-initializers -Wno-stringop-overread)
5457
add_compile_options(-DGTEST_HAS_TR1_TUPLE=0 -DGTEST_USE_OWN_TR1_TUPLE=0)
5558

5659
add_compile_options(-march=native)
@@ -62,9 +65,18 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address,undefined
6265
# ----------------------------------------------------------------------------
6366
find_package(EXPAT REQUIRED)
6467
find_package(BZip2 REQUIRED)
65-
find_package(ZLIB REQUIRED)
6668
find_package(OpenMP)
6769

70+
# zlib-ng
71+
set(ZLIB_COMPAT ON)
72+
set(WITH_GTEST OFF)
73+
set(ZLIB_ENABLE_TESTS OFF)
74+
add_subdirectory(vendor/zlib-ng/zlib-ng)
75+
76+
set(ZLIB_USE_STATIC_LIBS ON)
77+
set(ZLIB_INCLUDE_DIR "${CMAKE_BINARY_DIR}/vendor/zlib-ng/zlib-ng")
78+
set(ZLIB_LIBRARY "${CMAKE_BINARY_DIR}/vendor/zlib-ng/zlib-ng/libz.a")
79+
6880
# Disable installation of google stuff
6981
set(INSTALL_GMOCK OFF)
7082
set(INSTALL_GTEST OFF)

apps/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# along with osm2rdf. If not, see <https://www.gnu.org/licenses/>.
1818

1919
add_executable(osm2rdf osm2rdf.cpp)
20+
2021
target_link_libraries(osm2rdf PRIVATE osm2rdf_library spatialjoin-dev pb_util)
2122

2223
add_executable(osm2rdf-stats osm2rdf-stats.cpp)

apps/osm2rdf.cpp

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
#include "osm2rdf/config/ExitCode.h"
2727
#include "osm2rdf/osm/OsmiumHandler.h"
2828
#include "osm2rdf/ttl/Writer.h"
29-
#include "osm2rdf/util/Ram.h"
3029
#include "osm2rdf/util/Time.h"
3130
#include "osmium/util/memory.hpp"
3231

@@ -89,14 +88,6 @@ int main(int argc, char** argv) {
8988
config.fromArgs(argc, argv);
9089
std::cerr << config.getInfo(osm2rdf::util::formattedTimeSpacer) << std::endl;
9190

92-
std::cerr << osm2rdf::util::currentTimeFormatted() << "Free ram: "
93-
<< osm2rdf::util::ram::available() /
94-
(osm2rdf::util::ram::GIGA * 1.0)
95-
<< "G/"
96-
<< osm2rdf::util::ram::physPages() /
97-
(osm2rdf::util::ram::GIGA * 1.0)
98-
<< "G" << std::endl;
99-
10091
#if defined(_OPENMP)
10192
omp_set_num_threads(config.numThreads);
10293
#endif

include/osm2rdf/config/Config.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,6 @@ struct Config {
8282

8383
bool addSpatialRelsForUntaggedNodes = true;
8484

85-
bool separate = true;
86-
8785
std::string iriPrefixForUntaggedNodes =
8886
osm2rdf::ttl::constants::IRI_PREFIX__OSM_NODE_UNTAGGED;
8987

include/osm2rdf/osm/Area.h

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,8 @@ struct Area {
4444
[[nodiscard]] const ::util::geo::DMultiPolygon& geom() const noexcept;
4545
[[nodiscard]] double geomArea() const noexcept;
4646
[[nodiscard]] const ::util::geo::DBox& envelope() const noexcept;
47-
[[nodiscard]] double envelopeArea() const noexcept;
48-
[[nodiscard]] const ::util::geo::DPolygon& convexHull() const noexcept;
49-
[[nodiscard]] const ::util::geo::DPolygon& orientedBoundingBox() const noexcept;
47+
[[nodiscard]] ::util::geo::DPolygon convexHull() const noexcept;
48+
[[nodiscard]] ::util::geo::DPolygon orientedBoundingBox() const noexcept;
5049
[[nodiscard]] const ::util::geo::DPoint centroid() const noexcept;
5150
[[nodiscard]] bool fromWay() const noexcept;
5251
[[nodiscard]] bool hasName() const noexcept;
@@ -61,13 +60,9 @@ struct Area {
6160
id_t _id;
6261
// The OSM id
6362
id_t _objId;
64-
bool _hasName = false;
6563
double _geomArea = 0;
66-
double _envelopeArea = 0;
6764
::util::geo::DMultiPolygon _geom;
6865
::util::geo::DBox _envelope;
69-
::util::geo::DPolygon _convexHull;
70-
::util::geo::DPolygon _obb;
7166
};
7267

7368
} // namespace osm2rdf::osm

include/osm2rdf/osm/CountHandler.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ class CountHandler : public osmium::handler::Handler {
3535
size_t numNodes() const;
3636
size_t numRelations() const;
3737
size_t numWays() const;
38+
size_t weightedNumWays() const;
39+
size_t weightedNumRelations() const;
3840

3941
size_t minNodeId() const { return _minId; };
4042
size_t maxNodeId() const { return _maxId; };
@@ -43,6 +45,8 @@ class CountHandler : public osmium::handler::Handler {
4345
size_t _numNodes = 0;
4446
size_t _numRelations = 0;
4547
size_t _numWays = 0;
48+
size_t _weightedNumWays = 0;
49+
size_t _weightedNumRelations = 0;
4650
bool _firstPassDone = false;
4751
size_t _minId = std::numeric_limits<size_t>::max();
4852
size_t _maxId = 0;

include/osm2rdf/osm/FactHandler.h

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,24 +47,27 @@ class FactHandler {
4747
void setLocationHandler(osm2rdf::osm::LocationHandler* locationHandler);
4848
// Add data
4949
void area(const osm2rdf::osm::Area& area);
50-
void node(const osm2rdf::osm::Node& node);
50+
void node(const osmium::Node& node);
5151
void relation(const osm2rdf::osm::Relation& relation);
5252
void way(const osm2rdf::osm::Way& way);
5353

54-
template <typename G>
5554
void writeGeometry(const std::string& s, const std::string& p,
56-
const G& g);
55+
const ::util::geo::DPoint& g);
56+
template <typename G>
57+
void writeGeometry(const std::string& s, const std::string& p, const G& g);
5758

5859
protected:
5960
void writeBox(const std::string& s, const std::string& p,
6061
const ::util::geo::DBox& box);
6162
FRIEND_TEST(OSM_FactHandler, writeBoxPrecision1);
6263
FRIEND_TEST(OSM_FactHandler, writeBoxPrecision2);
6364

65+
void writeMeta(const std::string& s, const osmium::Node& object);
66+
6467
template <typename T>
6568
void writeMeta(const std::string& s, const T& object);
6669

67-
void writeTag(const std::string& s, const osm2rdf::osm::Tag& tag);
70+
void writeTag(const std::string& s, const char* key, const char* val);
6871
FRIEND_TEST(OSM_FactHandler, writeTag_AdminLevel);
6972
FRIEND_TEST(OSM_FactHandler, writeTag_AdminLevel_nonInteger);
7073
FRIEND_TEST(OSM_FactHandler, writeTag_AdminLevel_nonInteger2);
@@ -77,7 +80,7 @@ class FactHandler {
7780
FRIEND_TEST(OSM_FactHandler, writeTag_KeyIRI);
7881
FRIEND_TEST(OSM_FactHandler, writeTag_KeyNotIRI);
7982

80-
void writeTagList(const std::string& s, const osm2rdf::osm::TagList& tags);
83+
void writeTagList(const std::string& s, const osmium::TagList& tags);
8184
FRIEND_TEST(OSM_FactHandler, writeTagList);
8285
FRIEND_TEST(OSM_FactHandler, writeTagListWikidata);
8386
FRIEND_TEST(OSM_FactHandler, writeTagListRefSingle);
@@ -106,10 +109,15 @@ class FactHandler {
106109
FRIEND_TEST(OSM_FactHandler, writeTagListStartDateYearMonthDay5);
107110

108111
bool hasSuffix(const std::string& s, const std::string& suffix) const;
112+
bool hasSuffix(const char* s, const char* suffix, size_t suffixSize) const;
109113

110114
const osm2rdf::config::Config _config;
111115
osm2rdf::ttl::Writer<W>* _writer;
112116
osm2rdf::osm::LocationHandler* _locationHandler;
117+
bool _separateUntaggedNodePrefixes = false;
118+
std::string _datasetId, _relNamespace, _wayNamespace, _changesetNamespace,
119+
_iriXSDDouble, _iriWKTLiteral, _iriXSDInteger, _tagTripleCountIRI,
120+
_areaIRI;
113121
};
114122

115123
} // namespace osm2rdf::osm

include/osm2rdf/osm/GeometryHandler.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ class GeometryHandler {
5050

5151
// Add data
5252
void area(const osm2rdf::osm::Area& area);
53-
void node(const osm2rdf::osm::Node& node);
53+
void node(const osmium::Node& node);
5454
void relation(const osm2rdf::osm::Relation& relation);
5555
void way(const osm2rdf::osm::Way& way);
5656

@@ -69,8 +69,6 @@ class GeometryHandler {
6969
sj::Sweeper _sweeper;
7070
std::vector<sj::WriteBatch> _parseBatches;
7171

72-
std::string areaNS(AreaFromType type) const;
73-
7472
static ::util::geo::I32Point transform(const ::util::geo::DPoint& loc);
7573

7674
static ::util::geo::I32Box transform(const ::util::geo::DBox& box);
@@ -79,11 +77,15 @@ class GeometryHandler {
7977
static ::util::geo::I32MultiPolygon transform(
8078
const ::util::geo::DMultiPolygon& area);
8179

82-
void writeRelCb(size_t t, const std::string& a, const std::string& b,
83-
const std::string& pred);
80+
std::string getSweeperId(uint64_t oid, char type);
81+
std::string getFullID(const char* id, size_t n);
82+
83+
void writeRelCb(size_t t, const char* a, size_t an, const char* b, size_t bn,
84+
const char* pred, size_t predn);
8485
void progressCb(size_t progr);
8586

8687
osm2rdf::util::ProgressBar _progressBar;
88+
bool _separateUntaggedNodePrefixes = false;
8789
};
8890

8991
} // namespace osm2rdf::osm

include/osm2rdf/osm/LocationHandler.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ class LocationHandler : public osmium::handler::Handler {
4141
virtual ~LocationHandler() {}
4242
virtual void node(const osmium::Node& node) = 0;
4343
virtual void way(osmium::Way& way) = 0;
44+
virtual void finalizeNodes() = 0;
4445
[[nodiscard]] virtual osmium::Location get_node_location(
4546
const osmium::object_id_type id) const = 0;
4647
[[nodiscard]] virtual bool get_node_is_tagged(
@@ -57,6 +58,7 @@ class LocationHandlerImpl : public LocationHandler {
5758
size_t nodeIdMin, size_t nodeIdMax);
5859
void node(const osmium::Node& node);
5960
void way(osmium::Way& way);
61+
void finalizeNodes() {_nodesFinalized = true; };
6062
[[nodiscard]] osmium::Location get_node_location(
6163
const osmium::object_id_type nodeId) const;
6264
[[nodiscard]] bool get_node_is_tagged(
@@ -65,6 +67,7 @@ class LocationHandlerImpl : public LocationHandler {
6567
protected:
6668
T _index;
6769
osm2rdf::osm::handler::NodeLocationsForWays<T> _handler;
70+
bool _nodesFinalized = false;
6871
};
6972

7073
template <>
@@ -76,6 +79,7 @@ class LocationHandlerImpl<osmium::index::map::SparseFileArray<
7679
size_t nodeIdMin, size_t nodeIdMax);
7780
void node(const osmium::Node& node);
7881
void way(osmium::Way& way);
82+
void finalizeNodes() {_nodesFinalized = true; };
7983
[[nodiscard]] osmium::Location get_node_location(
8084
const osmium::object_id_type nodeId) const;
8185
[[nodiscard]] bool get_node_is_tagged(
@@ -90,6 +94,7 @@ class LocationHandlerImpl<osmium::index::map::SparseFileArray<
9094
osmium::index::map::SparseFileArray<osmium::unsigned_object_id_type,
9195
osm2rdf::osm::Location>>
9296
_handler;
97+
bool _nodesFinalized = false;
9398
};
9499

95100
template <>
@@ -101,6 +106,7 @@ class LocationHandlerImpl<osmium::index::map::DenseFileArray<
101106
size_t nodeIdMin, size_t nodeIdMax);
102107
void node(const osmium::Node& node);
103108
void way(osmium::Way& way);
109+
void finalizeNodes() {_nodesFinalized = true; };
104110
[[nodiscard]] osmium::Location get_node_location(
105111
const osmium::object_id_type nodeId) const;
106112
[[nodiscard]] bool get_node_is_tagged(
@@ -115,6 +121,7 @@ class LocationHandlerImpl<osmium::index::map::DenseFileArray<
115121
osmium::index::map::DenseFileArray<osmium::unsigned_object_id_type,
116122
osm2rdf::osm::Location>>
117123
_handler;
124+
bool _nodesFinalized = false;
118125
};
119126

120127
template <>
@@ -126,6 +133,7 @@ class LocationHandlerImpl<osm2rdf::osm::DenseMemIndex<
126133
size_t nodeIdMin, size_t nodeIdMax);
127134
void node(const osmium::Node& node);
128135
void way(osmium::Way& way);
136+
void finalizeNodes() {_nodesFinalized = true; };
129137
[[nodiscard]] osmium::Location get_node_location(
130138
const osmium::object_id_type nodeId) const;
131139
[[nodiscard]] bool get_node_is_tagged(
@@ -138,6 +146,7 @@ class LocationHandlerImpl<osm2rdf::osm::DenseMemIndex<
138146
osm2rdf::osm::handler::NodeLocationsForWays<osm2rdf::osm::DenseMemIndex<
139147
osmium::unsigned_object_id_type, osm2rdf::osm::Location>>
140148
_handler;
149+
bool _nodesFinalized = false;
141150
};
142151

143152
using LocationHandlerRAMDense = LocationHandlerImpl<osm2rdf::osm::DenseMemIndex<

0 commit comments

Comments
 (0)