From 61e545442f3c6fcac6f0c1e459a1fcd2d77f6175 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sat, 4 Apr 2026 13:49:02 +0200 Subject: [PATCH 1/3] oldms: string reading utility --- src/odr/internal/oldms/word/io.cpp | 97 +++++++++++++++++++++++++- src/odr/internal/oldms/word/io.hpp | 5 ++ src/odr/internal/util/string_util.cpp | 6 +- src/odr/internal/util/string_util.hpp | 1 + test/src/internal/oldms/oldms_test.cpp | 4 +- 5 files changed, 108 insertions(+), 5 deletions(-) diff --git a/src/odr/internal/oldms/word/io.cpp b/src/odr/internal/oldms/word/io.cpp index e04415397..7d476e8d2 100644 --- a/src/odr/internal/oldms/word/io.cpp +++ b/src/odr/internal/oldms/word/io.cpp @@ -1,7 +1,9 @@ #include +#include "odr/internal/util/string_util.hpp" + #include -#include +#include namespace odr::internal::oldms { @@ -190,4 +192,97 @@ void oldms::skip_Prc(std::istream &in) { in.ignore(cbGrpprl); } +std::string oldms::read_string_compressed(std::istream &in, + const std::size_t size) { + static constexpr auto eof = std::istream::traits_type::eof(); + + std::string result; + result.reserve(size); + + for (std::size_t i = 0; i < size; ++i) { + const auto ci = in.get(); + if (ci == eof) { + throw std::runtime_error("Unexpected end of input"); + } + if (ci < 0 || ci > 0xFF) { + throw std::runtime_error("Unexpected input: " + std::to_string(ci)); + } + const char c = static_cast(ci); + if (const std::optional uncompressed = uncompress_char(c); + uncompressed.has_value()) { + util::string::append_c32(*uncompressed, result); + } else { + result.push_back(c); + } + } + + return result; +} + +std::u16string oldms::read_string_uncompressed(std::istream &in, + const std::size_t size) { + std::u16string result; + result.resize(size); + + in.read(reinterpret_cast(result.data()), + static_cast(size * sizeof(char16_t))); + + return result; +} + +std::optional oldms::uncompress_char(const char c) { + switch (c) { + case '\x82': + return 0x201A; + case '\x83': + return 0x0192; + case '\x84': + return 0x201E; + case '\x85': + return 0x2026; + case '\x86': + return 0x2020; + case '\x87': + return 0x2021; + case '\x88': + return 0x02C6; + case '\x89': + return 0x2030; + case '\x8A': + return 0x0160; + case '\x8B': + return 0x2039; + case '\x8C': + return 0x0152; + case '\x91': + return 0x2018; + case '\x92': + return 0x2019; + case '\x93': + return 0x201C; + case '\x94': + return 0x201D; + case '\x95': + return 0x2022; + case '\x96': + return 0x2013; + case '\x97': + return 0x2014; + case '\x98': + return 0x02DC; + case '\x99': + return 0x2122; + case '\x9A': + return 0x0161; + case '\x9B': + return 0x203A; + case '\x9C': + return 0x0153; + case '\x9F': + return 0x0178; + default: + return std::nullopt; + } +} + } // namespace odr::internal diff --git a/src/odr/internal/oldms/word/io.hpp b/src/odr/internal/oldms/word/io.hpp index b3358b649..88657a2a9 100644 --- a/src/odr/internal/oldms/word/io.hpp +++ b/src/odr/internal/oldms/word/io.hpp @@ -30,4 +30,9 @@ void read_Clx(std::istream &in, const HandlePrc &handle_Prc, const HandlePcdt &handle_Pcdt); void skip_Prc(std::istream &in); +std::string read_string_compressed(std::istream &in, std::size_t size); +std::u16string read_string_uncompressed(std::istream &in, std::size_t size); + +std::optional uncompress_char(char c); + } // namespace odr::internal::oldms diff --git a/src/odr/internal/util/string_util.cpp b/src/odr/internal/util/string_util.cpp index 4975711dd..07cbcca04 100644 --- a/src/odr/internal/util/string_util.cpp +++ b/src/odr/internal/util/string_util.cpp @@ -1,9 +1,7 @@ #include #include -#include #include -#include #include #include @@ -85,4 +83,8 @@ std::string string::c16str_to_string(const char16_t *c16str, return u16string_to_string(std::u16string(c16str, length / 2)); } +void string::append_c32(const char32_t c, std::string &string) { + utf8::append(c, string); +} + } // namespace odr::internal::util diff --git a/src/odr/internal/util/string_util.hpp b/src/odr/internal/util/string_util.hpp index 71869fef1..d91e2525e 100644 --- a/src/odr/internal/util/string_util.hpp +++ b/src/odr/internal/util/string_util.hpp @@ -26,5 +26,6 @@ std::string to_string(double d, int precision); std::string u16string_to_string(const std::u16string &string); std::u16string string_to_u16string(const std::string &string); std::string c16str_to_string(const char16_t *c16str, std::size_t length); +void append_c32(char32_t c, std::string &string); } // namespace odr::internal::util::string diff --git a/test/src/internal/oldms/oldms_test.cpp b/test/src/internal/oldms/oldms_test.cpp index a74202932..aeb9bb054 100644 --- a/test/src/internal/oldms/oldms_test.cpp +++ b/test/src/internal/oldms/oldms_test.cpp @@ -78,8 +78,8 @@ TEST(OldMs, test) { const auto document_stream = files.open("/WordDocument").stream(); document_stream->seekg(first_text_offset); - const std::string first_text = - internal::util::stream::read(*document_stream, first_text_length); + const std::string first_text = internal::oldms::read_string_compressed( + *document_stream, first_text_length); std::cout << "first_text " << first_text << std::endl; }); } From 9a4f05c6fc39f6c052fececc9608786862c996fe Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sat, 4 Apr 2026 14:19:57 +0200 Subject: [PATCH 2/3] fix includes --- src/odr/internal/oldms/word/io.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/odr/internal/oldms/word/io.hpp b/src/odr/internal/oldms/word/io.hpp index 88657a2a9..9677fbe43 100644 --- a/src/odr/internal/oldms/word/io.hpp +++ b/src/odr/internal/oldms/word/io.hpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace odr::internal::oldms { From 724ea5a20b3d4a344b79b823d7cb00cb102cec81 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Sat, 4 Apr 2026 14:30:57 +0200 Subject: [PATCH 3/3] fix includes --- src/odr/internal/util/string_util.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/odr/internal/util/string_util.cpp b/src/odr/internal/util/string_util.cpp index 07cbcca04..74540643c 100644 --- a/src/odr/internal/util/string_util.cpp +++ b/src/odr/internal/util/string_util.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include