diff --git a/core/src/Streamly/Internal/FileSystem/Path/Common.hs b/core/src/Streamly/Internal/FileSystem/Path/Common.hs index 6f2221db19..f4fb96deda 100644 --- a/core/src/Streamly/Internal/FileSystem/Path/Common.hs +++ b/core/src/Streamly/Internal/FileSystem/Path/Common.hs @@ -1337,7 +1337,10 @@ unsafeFromChars :: (Unbox a) => -> Stream Identity Char -> Array a unsafeFromChars encode s = - let n = runIdentity $ Stream.fold Fold.length s + -- The encoded array may be longer than the char count. We are encoding it + -- twice, but it may still be cheaper than reallocating the array or + -- oversizing the array. + let n = runIdentity $ Stream.fold Fold.length (encode s) in Array.fromPureStreamN n (encode s) -- XXX Writing a custom fold for parsing a Posix path may be better for diff --git a/test/Streamly/Test/FileSystem/PosixPath.hs b/test/Streamly/Test/FileSystem/PosixPath.hs index 6e3c59c0cb..db944c2b6e 100644 --- a/test/Streamly/Test/FileSystem/PosixPath.hs +++ b/test/Streamly/Test/FileSystem/PosixPath.hs @@ -48,6 +48,11 @@ testFromString = describe "fromString" $ do str (p "/usr/bin") `shouldBe` "/usr/bin" it "relative roundtrip" $ str (p "a/b/c") `shouldBe` "a/b/c" + -- test correct array size allocation for unicode encoding + it "non-ASCII roundtrip preserves trailing bytes" $ + str (p "\945.txt") `shouldBe` "\945.txt" + it "multi-byte UTF-8 roundtrip (4-byte char)" $ + str (p "\x1F600/file") `shouldBe` "\x1F600/file" ------------------------------------------------------------------------------- -- Separators diff --git a/test/Streamly/Test/FileSystem/WindowsPath.hs b/test/Streamly/Test/FileSystem/WindowsPath.hs index fa5cb46596..ad73863cec 100644 --- a/test/Streamly/Test/FileSystem/WindowsPath.hs +++ b/test/Streamly/Test/FileSystem/WindowsPath.hs @@ -62,6 +62,13 @@ testFromString = describe "fromString" $ do str (p "C:\\Users") `shouldBe` "C:\\Users" it "forward slashes preserved on roundtrip" $ str (p "a/b") `shouldBe` "a/b" + -- test correct array size allocation for unicode encoding + it "non-ASCII (BMP) roundtrip" $ + str (p "\945.txt") `shouldBe` "\945.txt" + -- Non-BMP chars require a UTF-16 surrogate pair (2 words for 1 char), + -- which would be truncated if the array were sized by char count. + it "multi-word UTF-16 roundtrip (non-BMP char)" $ + str (p "\x1F600\\file") `shouldBe` "\x1F600\\file" ------------------------------------------------------------------------------- -- Validation