From dfc06b0a6d104f753ca6c341e1ce62f459449fc6 Mon Sep 17 00:00:00 2001 From: Jark Wu Date: Mon, 25 Aug 2025 17:31:09 +0800 Subject: [PATCH 001/287] Initial Commit --- fluss-rust/.DS_Store | Bin 0 -> 6148 bytes fluss-rust/.asf.yaml | 45 ++++ fluss-rust/.github/.DS_Store | Bin 0 -> 6148 bytes fluss-rust/.github/ISSUE_TEMPLATE/bug.yml | 58 +++++ fluss-rust/.github/ISSUE_TEMPLATE/config.yml | 19 ++ fluss-rust/.github/ISSUE_TEMPLATE/feature.yml | 58 +++++ fluss-rust/.github/ISSUE_TEMPLATE/task.yml | 51 +++++ fluss-rust/.github/PULL_REQUEST_TEMPLATE.md | 41 ++++ fluss-rust/.gitignore | 19 ++ fluss-rust/DISCLAIMER | 10 + fluss-rust/LICENSE | 201 ++++++++++++++++++ fluss-rust/NOTICE | 5 + fluss-rust/README.md | 138 ++++++++++++ 13 files changed, 645 insertions(+) create mode 100644 fluss-rust/.DS_Store create mode 100644 fluss-rust/.asf.yaml create mode 100644 fluss-rust/.github/.DS_Store create mode 100644 fluss-rust/.github/ISSUE_TEMPLATE/bug.yml create mode 100644 fluss-rust/.github/ISSUE_TEMPLATE/config.yml create mode 100644 fluss-rust/.github/ISSUE_TEMPLATE/feature.yml create mode 100644 fluss-rust/.github/ISSUE_TEMPLATE/task.yml create mode 100644 fluss-rust/.github/PULL_REQUEST_TEMPLATE.md create mode 100644 fluss-rust/.gitignore create mode 100644 fluss-rust/DISCLAIMER create mode 100644 fluss-rust/LICENSE create mode 100644 fluss-rust/NOTICE create mode 100644 fluss-rust/README.md diff --git a/fluss-rust/.DS_Store b/fluss-rust/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..aa422df10c54b59dcb4bd4dcfc37bcac4de3fc93 GIT binary patch literal 6148 zcmeHKO>fgc5S>j!VpAdI08%eVmbg|SK&gbdIA4cK1*Euw1E3J&sI_ptRpO9C6v=1! 
zFbDwAuj%c2 z@1rxljEYIInD-{Lw_H2XI*TvkN&GsUF2}9AeVrGRG@s54Ax)-8dG{vG6TR%|MV`zJ zH?kd`@A>1_-fH!<)7cLW4iDD*!Rpy#j-MQ^*S>e}!NcdH@u%!O*I%rZ3cosIcN|{A zR|J(+eU4^%rt?$uFzYrA=!hcf(=l}sy`UkDjxzl*&dkdB57C9Ue;Px?BOW8OkIa`~ z4lCK;D%lNWy`mG#*vwV;E4e<6FlrZb;(4^`oRSKIS9EnQsEUaKqJSuHZ3W!!>Fr*7 z1R;t7qQFg6fcJ+0&KNo@Et;(Zg}DL%>u5HHm_MDE6F3YVmKM2QQ$@? zpz4EY(8H44*}AnjK5GN`2RIw&l@@PUs5zR@h88s8 z1WxCo+3_D4ptD$i!>YK=9b4$y4D{mF8 z8}D9@yu?q21K00gQLiOs6ijVDxCpzWc4_TU#)%)s-M&f)J6#O9z6|4z9Jz8BcLplg z(*st)Dzr=G@p!jdt=P4CZBntvd)s91)F+dIwZ5^rf7X789%K2ei52)CRI*}l3NL8< zxW^~2A4f92!?@`j(_BVofEi#07LfsS*IC6y+#K(T8DIu}#Q@z8HY%ZGFx9B84s7W2 zk;Zd`BxuuHg3vbT7)&+d2#V0Dh&ol6Cx+1J=(kOrV=&dI(?O_}aUQd>FfSCLR!6_B z!a+D1xn%~JfmsH!x?7_9-~9glKbyooW`G%3C`lBun8o12 literal 0 HcmV?d00001 diff --git a/fluss-rust/.github/ISSUE_TEMPLATE/bug.yml b/fluss-rust/.github/ISSUE_TEMPLATE/bug.yml new file mode 100644 index 0000000000..43fbc90d8b --- /dev/null +++ b/fluss-rust/.github/ISSUE_TEMPLATE/bug.yml @@ -0,0 +1,58 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +name: Bug report 🐞 +description: Problems, bugs and issues with Fluss +type: "bug" +body: + - type: markdown + attributes: + value: | + Thank you very much for your feedback! + - type: checkboxes + attributes: + label: Search before asking + description: > + Please search [issues](https://github.com/apache/fluss-benchmarks/issues) to check if your issue has already been reported. + options: + - label: > + I searched in the [issues](https://github.com/apache/fluss-benchmarks/issues) and found nothing similar. + required: true + - type: textarea + attributes: + label: Please describe the bug 🐞 + description: > + Please describe the problem, what to expect, and how to reproduce. + Feel free to include stacktraces and the Fluss server/client configuration. + You can include files by dragging and dropping them here. + validations: + required: true + - type: textarea + attributes: + label: Solution + description: Describe the proposed solution about how to fix it if any. + - type: checkboxes + attributes: + label: Are you willing to submit a PR? + description: > + We look forward to the community of developers or users helping solve Fluss problems together. If you are willing to submit a PR to fix this problem, please check the box. + options: + - label: I'm willing to submit a PR! + - type: markdown + attributes: + value: "Thanks for completing our form!" diff --git a/fluss-rust/.github/ISSUE_TEMPLATE/config.yml b/fluss-rust/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..c2520da489 --- /dev/null +++ b/fluss-rust/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,19 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +blank_issues_enabled: false diff --git a/fluss-rust/.github/ISSUE_TEMPLATE/feature.yml b/fluss-rust/.github/ISSUE_TEMPLATE/feature.yml new file mode 100644 index 0000000000..fc7357165d --- /dev/null +++ b/fluss-rust/.github/ISSUE_TEMPLATE/feature.yml @@ -0,0 +1,58 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +name: Feature Request 🚀 +description: User-facing functionality or improvement you’d like to see added +type: "feature" +body: + - type: markdown + attributes: + value: | + Thank you very much for your feature proposal! + - type: checkboxes + attributes: + label: Search before asking + description: > + Please search [issues](https://github.com/apache/fluss-benchmarks/issues) to check if your issue has already been reported. + options: + - label: > + I searched in the [issues](https://github.com/apache/fluss-benchmarks/issues) and found nothing similar. + required: true + - type: textarea + attributes: + label: Motivation + description: Please describe the feature and elaborate on the use case and motivation behind it + validations: + required: true + - type: textarea + attributes: + label: Solution + description: Describe the proposed solution and add related materials like links if any. + - type: textarea + attributes: + label: Anything else? + - type: checkboxes + attributes: + label: Willingness to contribute + description: > + We look forward to the community of developers or users helping develop Fluss features together. If you are willing to submit a PR to implement the feature, please check the box. + options: + - label: I'm willing to submit a PR! + - type: markdown + attributes: + value: "Thanks for completing our form!" diff --git a/fluss-rust/.github/ISSUE_TEMPLATE/task.yml b/fluss-rust/.github/ISSUE_TEMPLATE/task.yml new file mode 100644 index 0000000000..70b5369a45 --- /dev/null +++ b/fluss-rust/.github/ISSUE_TEMPLATE/task.yml @@ -0,0 +1,51 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +name: Task 📌 +description: Specific work item - either part of a larger feature or independent project maintenance +type: "task" +body: + - type: markdown + attributes: + value: | + Thank you very much for your work! + - type: checkboxes + attributes: + label: Search before asking + description: > + Please search [issues](https://github.com/apache/fluss-benchmarks/issues) to check if your issue has already been reported. + options: + - label: > + I searched in the [issues](https://github.com/apache/fluss-benchmarks/issues) and found nothing similar. + required: true + - type: textarea + attributes: + label: Description + description: Please describe the task and the purpose of the work. + validations: + required: true + - type: checkboxes + attributes: + label: Willingness to contribute + description: > + We look forward to the community of developers or users helping develop Fluss together. If you are willing to submit a PR to implement the task, please check the box. + options: + - label: I'm willing to submit a PR! + - type: markdown + attributes: + value: "Thanks for completing our form!" 
diff --git a/fluss-rust/.github/PULL_REQUEST_TEMPLATE.md b/fluss-rust/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000..5e03d8df9c --- /dev/null +++ b/fluss-rust/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,41 @@ + + +### Purpose + + +Linked issue: close #xxx + + + +### Brief change log + + + +### Tests + + + +### API and Format + + + +### Documentation + + diff --git a/fluss-rust/.gitignore b/fluss-rust/.gitignore new file mode 100644 index 0000000000..0e51e8099b --- /dev/null +++ b/fluss-rust/.gitignore @@ -0,0 +1,19 @@ +# Generated by Cargo +# will have compiled files and executables +debug/ +target/ +Cargo.lock + +# These are backup files generated by rustfmt +**/*.rs.bk + +# MSVC Windows builds of rustc generate these, which store debugging information +*.pdb + +# RustRover +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ +.vscode/ \ No newline at end of file diff --git a/fluss-rust/DISCLAIMER b/fluss-rust/DISCLAIMER new file mode 100644 index 0000000000..ddc4f01632 --- /dev/null +++ b/fluss-rust/DISCLAIMER @@ -0,0 +1,10 @@ +Apache Fluss (incubating) is an effort undergoing incubation at The Apache +Software Foundation (ASF), sponsored by the Apache Incubator PMC. + +Incubation is required of all newly accepted projects until a further review +indicates that the infrastructure, communications, and decision making process +have stabilized in a manner consistent with other successful ASF projects. + +While incubation status is not necessarily a reflection of the completeness +or stability of the code, it does indicate that the project has yet to be +fully endorsed by the ASF. 
\ No newline at end of file diff --git a/fluss-rust/LICENSE b/fluss-rust/LICENSE new file mode 100644 index 0000000000..261eeb9e9f --- /dev/null +++ b/fluss-rust/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/fluss-rust/NOTICE b/fluss-rust/NOTICE new file mode 100644 index 0000000000..1ec5da05df --- /dev/null +++ b/fluss-rust/NOTICE @@ -0,0 +1,5 @@ +Apache Fluss Rust (incubating) +Copyright 2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). \ No newline at end of file diff --git a/fluss-rust/README.md b/fluss-rust/README.md new file mode 100644 index 0000000000..6caaebb154 --- /dev/null +++ b/fluss-rust/README.md @@ -0,0 +1,138 @@ + + +# Apache Fluss™ Rust (Incubating) + +![Experimental](https://img.shields.io/badge/status-experimental-orange) + +Rust implementation of [Apache Fluss™](https://fluss.apache.org/). + + +## Why Fluss? 
+[Fluss](https://fluss.apache.org/) is a streaming storage built for real-time analytics which can serve as the real-time data layer for Lakehouse architectures. +It bridges the gap between streaming data and the data Lakehouse by enabling low-latency, high-throughput data ingestion and processing while seamlessly integrating with popular compute engines. + +## Why Fluss Rust Client +It's an unofficial experimental Rust client for interacting with Fluss. This client provides foundational capabilities for table management and log streaming operations, enabling developers to explore Fluss within Rust ecosystems. + +## Quick-Start + +### Step1 Start Fluss cluster +#### Requirements +Fluss runs on all UNIX-like environments, e.g. Linux, macOS. Before you start to set up the system, make sure you have the following software installed on your test machine: + +Java 17 or higher (Java 8 and Java 11 are not recommended) +If your cluster does not fulfill these software requirements you will need to install/upgrade it. + +Fluss requires the JAVA_HOME environment variable to be set on all nodes and point to the directory of your Java installation. + +#### Fluss Setup +Go to the [downloads](https://fluss.apache.org/downloads/) page and download the Fluss-0.6.0. Make sure to pick the Fluss package matching your Java version. After downloading the latest release, extract it: +```shell +tar -xzf fluss-0.7-SNAPSHOT-bin.tgz +cd fluss-0.7-SNAPSHOT/ +``` +You can start a local Fluss cluster by running the following command: +```shell +./bin/local-cluster.sh start +``` +After that, the Fluss local cluster is started. + +### Run Provided Example +The example only supports Linux and macOS. You will need to [install Rust](https://www.rust-lang.org/tools/install) first.
+ +After that, go to the project directory, build it and run the example: +```shell +cargo build --example example-table --release +cd target/release/examples +./example-table +``` +The example code is as follows: +```rust +#[tokio::main] +pub async fn main() -> Result<()> { + // 1: create the table; + let mut args = Args::default(); + args.bootstrap_server = "127.0.0.1:9123".to_string(); + let conn_config = ConnectionConfig::from_args(args); + let conn = FlussConnection::new(conn_config).await; + + let admin = conn.get_admin(); + + let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("c1", DataTypes::int()) + .column("c2", DataTypes::string()) + .build(), + ) + .build(); + + let table_path = TablePath::new("fluss".to_owned(), "rust_test".to_owned()); + + admin + .create_table(&table_path, &table_descriptor, true) + .await + .unwrap(); + + // 2: get the table + let table_info = admin.get_table(&table_path).await.unwrap(); + print!("Get created table:\n {}\n", table_info); + + // sleep 2 seconds to wait for the leader to be ready + thread::sleep(Duration::from_secs(2)); + + // 3: append log to the table + let table = conn.get_table(&table_path).await; + let append_writer = table.new_append().create_writer(); + let batch = record_batch!(("c1", Int32, [1, 2, 3, 4, 5, 6]), ("c2", Utf8, ["a1", "a2", "a3", "a4", "a5", "a6"])).unwrap(); + append_writer.append(batch).await?; + println!("Start to scan log records......"); + // 4: scan the records + let log_scanner = table.new_scan().create_log_scanner(); + log_scanner.subscribe(0, 0).await; + + loop { + let scan_records = log_scanner.poll(Duration::from_secs(10)).await?; + println!("Start to poll records......"); + for record in scan_records { + let row = record.row(); + println!( + "{{{}, {}}}@{}", + row.get_int(0), + row.get_string(1), + record.offset() + ); + } + } + Ok(()) +} +``` + +You can change it according to your needs, have fun! + +#### Clear environment +Then, stop your Fluss cluster.
Go to your Fluss home, stop it via the following commands: +```shell +./bin/local-cluster.sh stop +``` + + +## License + +Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) \ No newline at end of file From 2ab9022c1ec609c195ae28e3fbf852700e66449c Mon Sep 17 00:00:00 2001 From: Jark Wu Date: Mon, 25 Aug 2025 17:32:17 +0800 Subject: [PATCH 002/287] Initial Commit --- fluss-rust/.DS_Store | Bin 6148 -> 0 bytes fluss-rust/.gitignore | 1 + 2 files changed, 1 insertion(+) delete mode 100644 fluss-rust/.DS_Store diff --git a/fluss-rust/.DS_Store b/fluss-rust/.DS_Store deleted file mode 100644 index aa422df10c54b59dcb4bd4dcfc37bcac4de3fc93..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKO>fgc5S>j!VpAdI08%eVmbg|SK&gbdIA4cK1*Euw1E3J&sI_ptRpO9C6v=1! zFbDwAuj%c2 z@1rxljEYIInD-{Lw_H2XI*TvkN&GsUF2}9AeVrGRG@s54Ax)-8dG{vG6TR%|MV`zJ zH?kd`@A>1_-fH!<)7cLW4iDD*!Rpy#j-MQ^*S>e}!NcdH@u%!O*I%rZ3cosIcN|{A zR|J(+eU4^%rt?$uFzYrA=!hcf(=l}sy`UkDjxzl*&dkdB57C9Ue;Px?BOW8OkIa`~ z4lCK;D%lNWy`mG#*vwV;E4e<6FlrZb;(4^`oRSKIS9EnQsEUaKqJSuHZ3W!!>Fr*7 z1R;t7qQFg6fcJ+0&KNo@Et;(Zg}DL%>u5HHm_MDE6F3YVmKM2QQ$@? zpz4EY(8H44*}AnjK5GN`2RIw&l@@ Date: Mon, 25 Aug 2025 17:35:53 +0800 Subject: [PATCH 003/287] [github] Update GitHub issue template --- fluss-rust/.github/ISSUE_TEMPLATE/bug.yml | 4 ++-- fluss-rust/.github/ISSUE_TEMPLATE/feature.yml | 4 ++-- fluss-rust/.github/ISSUE_TEMPLATE/task.yml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fluss-rust/.github/ISSUE_TEMPLATE/bug.yml b/fluss-rust/.github/ISSUE_TEMPLATE/bug.yml index 43fbc90d8b..aadd86da4e 100644 --- a/fluss-rust/.github/ISSUE_TEMPLATE/bug.yml +++ b/fluss-rust/.github/ISSUE_TEMPLATE/bug.yml @@ -28,10 +28,10 @@ body: attributes: label: Search before asking description: > - Please search [issues](https://github.com/apache/fluss-benchmarks/issues) to check if your issue has already been reported. 
+ Please search [issues](https://github.com/apache/fluss-rust/issues) to check if your issue has already been reported. options: - label: > - I searched in the [issues](https://github.com/apache/fluss-benchmarks/issues) and found nothing similar. + I searched in the [issues](https://github.com/apache/fluss-rust/issues) and found nothing similar. required: true - type: textarea attributes: diff --git a/fluss-rust/.github/ISSUE_TEMPLATE/feature.yml b/fluss-rust/.github/ISSUE_TEMPLATE/feature.yml index fc7357165d..9f08a60008 100644 --- a/fluss-rust/.github/ISSUE_TEMPLATE/feature.yml +++ b/fluss-rust/.github/ISSUE_TEMPLATE/feature.yml @@ -28,10 +28,10 @@ body: attributes: label: Search before asking description: > - Please search [issues](https://github.com/apache/fluss-benchmarks/issues) to check if your issue has already been reported. + Please search [issues](https://github.com/apache/fluss-rust/issues) to check if your issue has already been reported. options: - label: > - I searched in the [issues](https://github.com/apache/fluss-benchmarks/issues) and found nothing similar. + I searched in the [issues](https://github.com/apache/fluss-rust/issues) and found nothing similar. required: true - type: textarea attributes: diff --git a/fluss-rust/.github/ISSUE_TEMPLATE/task.yml b/fluss-rust/.github/ISSUE_TEMPLATE/task.yml index 70b5369a45..dddc621f28 100644 --- a/fluss-rust/.github/ISSUE_TEMPLATE/task.yml +++ b/fluss-rust/.github/ISSUE_TEMPLATE/task.yml @@ -28,10 +28,10 @@ body: attributes: label: Search before asking description: > - Please search [issues](https://github.com/apache/fluss-benchmarks/issues) to check if your issue has already been reported. + Please search [issues](https://github.com/apache/fluss-rust/issues) to check if your issue has already been reported. options: - label: > - I searched in the [issues](https://github.com/apache/fluss-benchmarks/issues) and found nothing similar. 
+ I searched in the [issues](https://github.com/apache/fluss-rust/issues) and found nothing similar. required: true - type: textarea attributes: From f83b32ca6b89c721735c133a7041cb88b2f48ac1 Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Tue, 26 Aug 2025 20:51:06 +0800 Subject: [PATCH 004/287] [feat] Support basic write & read (#2) --- fluss-rust/.github/dependabot.yml | 30 + fluss-rust/.github/workflows/ci.yml | 94 ++ fluss-rust/Cargo.toml | 36 + fluss-rust/copyright.txt | 17 + fluss-rust/crates/examples/Cargo.toml | 34 + .../crates/examples/src/example_table.rs | 86 ++ fluss-rust/crates/fluss/Cargo.toml | 55 ++ fluss-rust/crates/fluss/src/build.rs | 23 + fluss-rust/crates/fluss/src/client/admin.rs | 93 ++ .../crates/fluss/src/client/connection.rs | 82 ++ .../crates/fluss/src/client/metadata.rs | 109 +++ fluss-rust/crates/fluss/src/client/mod.rs | 26 + .../crates/fluss/src/client/table/append.rs | 69 ++ .../crates/fluss/src/client/table/mod.rs | 73 ++ .../crates/fluss/src/client/table/scanner.rs | 370 +++++++ .../crates/fluss/src/client/table/writer.rs | 88 ++ .../fluss/src/client/write/accumulator.rs | 442 +++++++++ .../crates/fluss/src/client/write/batch.rs | 176 ++++ .../fluss/src/client/write/broadcast.rs | 119 +++ .../fluss/src/client/write/bucket_assigner.rs | 102 ++ .../crates/fluss/src/client/write/mod.rs | 68 ++ .../crates/fluss/src/client/write/sender.rs | 207 ++++ .../fluss/src/client/write/writer_client.rs | 147 +++ .../crates/fluss/src/cluster/cluster.rs | 243 +++++ fluss-rust/crates/fluss/src/cluster/mod.rs | 99 ++ fluss-rust/crates/fluss/src/config.rs | 39 + fluss-rust/crates/fluss/src/error.rs | 50 + fluss-rust/crates/fluss/src/lib.rs | 37 + .../crates/fluss/src/metadata/datatype.rs | 814 ++++++++++++++++ .../crates/fluss/src/metadata/json_serde.rs | 464 +++++++++ fluss-rust/crates/fluss/src/metadata/mod.rs | 24 + fluss-rust/crates/fluss/src/metadata/table.rs | 920 ++++++++++++++++++ .../crates/fluss/src/proto/fluss_api.proto | 197 ++++ 
fluss-rust/crates/fluss/src/record/arrow.rs | 545 +++++++++++ fluss-rust/crates/fluss/src/record/error.rs | 27 + fluss-rust/crates/fluss/src/record/mod.rs | 174 ++++ fluss-rust/crates/fluss/src/row/column.rs | 169 ++++ fluss-rust/crates/fluss/src/row/datum.rs | 287 ++++++ fluss-rust/crates/fluss/src/row/mod.rs | 148 +++ fluss-rust/crates/fluss/src/rpc/api_key.rs | 54 + .../crates/fluss/src/rpc/api_version.rs | 54 + fluss-rust/crates/fluss/src/rpc/convert.rs | 43 + fluss-rust/crates/fluss/src/rpc/error.rs | 50 + fluss-rust/crates/fluss/src/rpc/frame.rs | 106 ++ .../fluss/src/rpc/message/create_table.rs | 62 ++ .../crates/fluss/src/rpc/message/fetch.rs | 56 ++ .../crates/fluss/src/rpc/message/get_table.rs | 54 + .../crates/fluss/src/rpc/message/header.rs | 73 ++ .../crates/fluss/src/rpc/message/mod.rs | 97 ++ .../fluss/src/rpc/message/produce_log.rs | 71 ++ .../fluss/src/rpc/message/update_metadata.rs | 60 ++ fluss-rust/crates/fluss/src/rpc/mod.rs | 31 + .../crates/fluss/src/rpc/server_connection.rs | 402 ++++++++ fluss-rust/crates/fluss/src/rpc/transport.rs | 83 ++ fluss-rust/crates/fluss/src/util/mod.rs | 176 ++++ .../fluss/tests/integration/client/mod.rs | 21 + fluss-rust/crates/fluss/tests/test_fluss.rs | 25 + fluss-rust/rust-toolchain.toml | 20 + fluss-rust/rustfmt.toml | 19 + 59 files changed, 8340 insertions(+) create mode 100644 fluss-rust/.github/dependabot.yml create mode 100644 fluss-rust/.github/workflows/ci.yml create mode 100644 fluss-rust/Cargo.toml create mode 100644 fluss-rust/copyright.txt create mode 100644 fluss-rust/crates/examples/Cargo.toml create mode 100644 fluss-rust/crates/examples/src/example_table.rs create mode 100644 fluss-rust/crates/fluss/Cargo.toml create mode 100644 fluss-rust/crates/fluss/src/build.rs create mode 100644 fluss-rust/crates/fluss/src/client/admin.rs create mode 100644 fluss-rust/crates/fluss/src/client/connection.rs create mode 100644 fluss-rust/crates/fluss/src/client/metadata.rs create mode 100644 
fluss-rust/crates/fluss/src/client/mod.rs create mode 100644 fluss-rust/crates/fluss/src/client/table/append.rs create mode 100644 fluss-rust/crates/fluss/src/client/table/mod.rs create mode 100644 fluss-rust/crates/fluss/src/client/table/scanner.rs create mode 100644 fluss-rust/crates/fluss/src/client/table/writer.rs create mode 100644 fluss-rust/crates/fluss/src/client/write/accumulator.rs create mode 100644 fluss-rust/crates/fluss/src/client/write/batch.rs create mode 100644 fluss-rust/crates/fluss/src/client/write/broadcast.rs create mode 100644 fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs create mode 100644 fluss-rust/crates/fluss/src/client/write/mod.rs create mode 100644 fluss-rust/crates/fluss/src/client/write/sender.rs create mode 100644 fluss-rust/crates/fluss/src/client/write/writer_client.rs create mode 100644 fluss-rust/crates/fluss/src/cluster/cluster.rs create mode 100644 fluss-rust/crates/fluss/src/cluster/mod.rs create mode 100644 fluss-rust/crates/fluss/src/config.rs create mode 100644 fluss-rust/crates/fluss/src/error.rs create mode 100644 fluss-rust/crates/fluss/src/lib.rs create mode 100644 fluss-rust/crates/fluss/src/metadata/datatype.rs create mode 100644 fluss-rust/crates/fluss/src/metadata/json_serde.rs create mode 100644 fluss-rust/crates/fluss/src/metadata/mod.rs create mode 100644 fluss-rust/crates/fluss/src/metadata/table.rs create mode 100644 fluss-rust/crates/fluss/src/proto/fluss_api.proto create mode 100644 fluss-rust/crates/fluss/src/record/arrow.rs create mode 100644 fluss-rust/crates/fluss/src/record/error.rs create mode 100644 fluss-rust/crates/fluss/src/record/mod.rs create mode 100644 fluss-rust/crates/fluss/src/row/column.rs create mode 100644 fluss-rust/crates/fluss/src/row/datum.rs create mode 100644 fluss-rust/crates/fluss/src/row/mod.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/api_key.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/api_version.rs create mode 100644 
fluss-rust/crates/fluss/src/rpc/convert.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/error.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/frame.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/create_table.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/fetch.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/get_table.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/header.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/mod.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/produce_log.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/update_metadata.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/mod.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/server_connection.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/transport.rs create mode 100644 fluss-rust/crates/fluss/src/util/mod.rs create mode 100644 fluss-rust/crates/fluss/tests/integration/client/mod.rs create mode 100644 fluss-rust/crates/fluss/tests/test_fluss.rs create mode 100644 fluss-rust/rust-toolchain.toml create mode 100644 fluss-rust/rustfmt.toml diff --git a/fluss-rust/.github/dependabot.yml b/fluss-rust/.github/dependabot.yml new file mode 100644 index 0000000000..7c12d72c0c --- /dev/null +++ b/fluss-rust/.github/dependabot.yml @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +version: 2 +updates: + # Maintain dependencies for GitHub Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + + # Maintain dependencies for rust + - package-ecosystem: "cargo" + directory: "/" + schedule: + interval: "monthly" \ No newline at end of file diff --git a/fluss-rust/.github/workflows/ci.yml b/fluss-rust/.github/workflows/ci.yml new file mode 100644 index 0000000000..26616292b5 --- /dev/null +++ b/fluss-rust/.github/workflows/ci.yml @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: CI + +on: + push: + branches: + - main + pull_request: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} + cancel-in-progress: true + +jobs: + check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Check License Header + uses: apache/skywalking-eyes/header@v0.6.0 + + - name: Install protoc + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Format + run: cargo fmt --all -- --check + + - name: Clippy + run: cargo clippy --all-targets --workspace -- -D warnings + + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: + - ubuntu-latest + - macos-latest + steps: + - uses: actions/checkout@v4 + - name: Install protoc + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get update && sudo apt-get install -y protobuf-compiler + elif [ "$RUNNER_OS" == "macOS" ]; then + brew install protobuf + fi + - name: Build + run: cargo build + + test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: + - ubuntu-latest + - macos-latest + steps: + - uses: actions/checkout@v4 + - name: Install protoc + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get update && sudo apt-get install -y protobuf-compiler + elif [ "$RUNNER_OS" == "macOS" ]; then + brew install protobuf + fi + - name: Unit Test + run: cargo test --all-targets --workspace + env: + RUST_LOG: DEBUG + RUST_BACKTRACE: full + - name: Integration Test + run: cargo test --features integration_tests --all-targets --workspace + env: + RUST_LOG: DEBUG + RUST_BACKTRACE: full \ No newline at end of file diff --git a/fluss-rust/Cargo.toml b/fluss-rust/Cargo.toml new file mode 100644 index 0000000000..059236fccf --- /dev/null +++ b/fluss-rust/Cargo.toml @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[workspace.package] +categories = ["command-line-utilities"] +description = "The rust implementation of fluss" +repository = "https://github.com/apache/fluss-rust" +name = "fluss" +edition = "2024" +version = "0.1.0" +license = "Apache-2.0" +rust-version = "1.85" + + +[workspace] +resolver = "2" +members = ["crates/fluss", "crates/examples"] + +[workspace.dependencies] +fluss = { version = "0.1.0", path = "./crates/fluss" } +tokio = { version = "1.44.2", features = ["full"] } +clap = { version = "4.5.37", features = ["derive"] } \ No newline at end of file diff --git a/fluss-rust/copyright.txt b/fluss-rust/copyright.txt new file mode 100644 index 0000000000..d5519133ed --- /dev/null +++ b/fluss-rust/copyright.txt @@ -0,0 +1,17 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ \ No newline at end of file diff --git a/fluss-rust/crates/examples/Cargo.toml b/fluss-rust/crates/examples/Cargo.toml new file mode 100644 index 0000000000..82d864f818 --- /dev/null +++ b/fluss-rust/crates/examples/Cargo.toml @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +edition = { workspace = true } +license = { workspace = true } +name = "fluss-examples" +rust-version = { workspace = true } +version = { workspace = true } + + +[dependencies] +fluss = { workspace = true } +tokio = { workspace = true } +clap = { workspace = true} + + +[[example]] +name = "example-table" +path = "src/example_table.rs" \ No newline at end of file diff --git a/fluss-rust/crates/examples/src/example_table.rs b/fluss-rust/crates/examples/src/example_table.rs new file mode 100644 index 0000000000..3eb8dd867f --- /dev/null +++ b/fluss-rust/crates/examples/src/example_table.rs @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use clap::Parser; +use fluss::client::FlussConnection; +use fluss::config::Config; +use fluss::error::Result; +use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; +use fluss::row::{GenericRow, InternalRow}; +use std::time::Duration; +use tokio::try_join; + +#[tokio::main] +pub async fn main() -> Result<()> { + let mut config = Config::parse(); + config.bootstrap_server = Some("127.0.0.1:56405".to_string()); + + let conn = FlussConnection::new(config).await?; + + let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("c1", DataTypes::int()) + .column("c2", DataTypes::string()) + .build()?, + ) + .build()?; + + let table_path = TablePath::new("fluss".to_owned(), "rust_test".to_owned()); + + let admin = conn.get_admin().await?; + + admin + .create_table(&table_path, &table_descriptor, true) + .await?; + + // 2: get the table + let table_info = admin.get_table(&table_path).await?; + print!("Get created table:\n {table_info}\n"); + + // write row + let mut row = GenericRow::new(); + row.set_field(0, 22222); + row.set_field(1, "t2t"); + + let table = conn.get_table(&table_path).await?; + let append_writer = table.new_append()?.create_writer(); + let f1 = append_writer.append(row); + row = GenericRow::new(); + row.set_field(0, 233333); + row.set_field(1, "tt44"); + let f2 = append_writer.append(row); + try_join!(f1, f2, append_writer.flush())?; + + // scan rows + let log_scanner = table.new_scan().create_log_scanner(); + log_scanner.subscribe(0, 0).await?; + + loop { + let scan_records = log_scanner.poll(Duration::from_secs(10)).await?; + println!("Start to poll records......"); + for record in scan_records { + let row = record.row(); + println!( + "{{{}, {}}}@{}", + row.get_int(0), + row.get_string(1), + record.offset() + ); + } + } +} diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml new file mode 100644 index 0000000000..cc26014a46 --- /dev/null +++ 
b/fluss-rust/crates/fluss/Cargo.toml @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +edition = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } +name = "fluss" +build = "src/build.rs" + +[dependencies] +arrow = "55.1.0" +arrow-schema = "55.1.0" +byteorder = "1.5" +futures = "0.3" +clap = { workspace = true } +crc32c = "0.6.8" +linked-hash-map = "0.5.6" +prost = "0.13.5" +rand = "0.9.1" +serde = { version = "1.0.219", features = ["derive", "rc"] } +serde_json = "1.0.140" +thiserror = "1.0" +tracing = "0.1" +tokio = { workspace = true } +parking_lot = "0.12" +bytes = "1.10.1" +dashmap = "6.1.0" +rust_decimal = "1" +ordered-float = { version = "4", features = ["serde"] } +parse-display = "0.10" +ref-cast = "1.0" +chrono = { version = "0.4", features = ["clock", "std", "wasmbind"] } + + +[features] +integration_tests = [] + + +[build-dependencies] +prost-build = { version = "0.13.5" } \ No newline at end of file diff --git a/fluss-rust/crates/fluss/src/build.rs b/fluss-rust/crates/fluss/src/build.rs new file mode 100644 index 0000000000..a83cd056b5 --- /dev/null +++ b/fluss-rust/crates/fluss/src/build.rs @@ -0,0 +1,23 @@ +// Licensed to the Apache Software 
Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::io::Result; + +fn main() -> Result<()> { + prost_build::compile_protos(&["src/proto/fluss_api.proto"], &["src/proto"])?; + Ok(()) +} diff --git a/fluss-rust/crates/fluss/src/client/admin.rs b/fluss-rust/crates/fluss/src/client/admin.rs new file mode 100644 index 0000000000..8688a2d844 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/admin.rs @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::metadata::Metadata; +use crate::metadata::{JsonSerde, TableDescriptor, TableInfo, TablePath}; +use crate::rpc::message::{CreateTableRequest, GetTableRequest}; +use crate::rpc::{RpcClient, ServerConnection}; +use std::sync::Arc; + +use crate::error::Result; +use crate::proto::GetTableInfoResponse; + +#[allow(dead_code)] +pub struct FlussAdmin { + admin_gateway: ServerConnection, + metadata: Arc, + rpc_client: Arc, +} + +impl FlussAdmin { + pub async fn new(connections: Arc, metadata: Arc) -> Result { + let admin_con = connections + .get_connection( + metadata + .get_cluster() + .get_coordinator_server() + .expect("Couldn't coordinator server"), + ) + .await?; + + Ok(FlussAdmin { + admin_gateway: admin_con, + metadata, + rpc_client: connections, + }) + } + + pub async fn create_table( + &self, + table_path: &TablePath, + table_descriptor: &TableDescriptor, + ignore_if_exists: bool, + ) -> Result<()> { + let _response = self + .admin_gateway + .request(CreateTableRequest::new( + table_path, + table_descriptor, + ignore_if_exists, + )?) + .await?; + Ok(()) + } + + pub async fn get_table(&self, table_path: &TablePath) -> Result { + let response = self + .admin_gateway + .request(GetTableRequest::new(table_path)) + .await?; + let GetTableInfoResponse { + table_id, + schema_id, + table_json, + created_time, + modified_time, + } = response; + let v: &[u8] = &table_json[..]; + let table_descriptor = + TableDescriptor::deserialize_json(&serde_json::from_slice(v).unwrap())?; + Ok(TableInfo::of( + table_path.clone(), + table_id, + schema_id, + table_descriptor, + created_time, + modified_time, + )) + } +} diff --git a/fluss-rust/crates/fluss/src/client/connection.rs b/fluss-rust/crates/fluss/src/client/connection.rs new file mode 100644 index 0000000000..899ad597c1 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/connection.rs @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::WriterClient; +use crate::client::admin::FlussAdmin; +use crate::client::metadata::Metadata; +use crate::client::table::FlussTable; +use crate::config::Config; +use crate::rpc::RpcClient; +use parking_lot::RwLock; +use std::sync::Arc; + +use crate::error::Result; +use crate::metadata::TablePath; + +pub struct FlussConnection { + metadata: Arc, + network_connects: Arc, + args: Config, + writer_client: RwLock>>, +} + +impl FlussConnection { + pub async fn new(arg: Config) -> Result { + let connections = Arc::new(RpcClient::new()); + let metadata = Metadata::new( + arg.bootstrap_server.as_ref().unwrap().as_str(), + connections.clone(), + ) + .await?; + + Ok(FlussConnection { + metadata: Arc::new(metadata), + network_connects: connections.clone(), + args: arg.clone(), + writer_client: Default::default(), + }) + } + + pub fn get_metadata(&self) -> Arc { + self.metadata.clone() + } + + pub fn get_connections(&self) -> Arc { + self.network_connects.clone() + } + + pub async fn get_admin(&self) -> Result { + FlussAdmin::new(self.network_connects.clone(), self.metadata.clone()).await + } + + pub fn get_or_create_writer_client(&self) -> Result> { + if let Some(client) = self.writer_client.read().as_ref() { + return Ok(client.clone()); + } + + // If not exists, 
create new one + let client = Arc::new(WriterClient::new(self.args.clone(), self.metadata.clone())?); + *self.writer_client.write() = Some(client.clone()); + Ok(client) + } + + pub async fn get_table(&self, table_path: &TablePath) -> Result> { + self.metadata.update_table_metadata(table_path).await?; + let table_info = self.metadata.get_cluster().get_table(table_path).clone(); + Ok(FlussTable::new(self, self.metadata.clone(), table_info)) + } +} diff --git a/fluss-rust/crates/fluss/src/client/metadata.rs b/fluss-rust/crates/fluss/src/client/metadata.rs new file mode 100644 index 0000000000..ebfb959f65 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/metadata.rs @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::cluster::{Cluster, ServerNode, ServerType}; +use crate::metadata::{TableBucket, TablePath}; +use crate::rpc::{RpcClient, ServerConnection, UpdateMetadataRequest}; +use parking_lot::RwLock; +use std::collections::HashSet; +use std::net::SocketAddr; +use std::sync::Arc; + +use crate::error::Result; +use crate::proto::MetadataResponse; + +#[derive(Default)] +pub struct Metadata { + cluster: RwLock>, + connections: Arc, +} + +impl Metadata { + pub async fn new(boot_strap: &str, connections: Arc) -> Result { + let custer = Self::init_cluster(boot_strap, connections.clone()).await?; + Ok(Metadata { + cluster: RwLock::new(Arc::new(custer)), + connections, + }) + } + + async fn init_cluster(boot_strap: &str, connections: Arc) -> Result { + let socker_addrss = boot_strap.parse::().unwrap(); + let server_node = ServerNode::new( + -1, + socker_addrss.ip().to_string(), + socker_addrss.port() as u32, + ServerType::CoordinatorServer, + ); + let con = connections.get_connection(&server_node).await?; + let response = con.request(UpdateMetadataRequest::new(&[])).await?; + Cluster::from_metadata_response(response, None) + } + + pub async fn update(&self, metadata_response: MetadataResponse) -> Result<()> { + let origin_cluster = self.cluster.read().clone(); + let new_cluster = + Cluster::from_metadata_response(metadata_response, Some(&origin_cluster))?; + let mut cluster = self.cluster.write(); + *cluster = Arc::new(new_cluster); + Ok(()) + } + + pub async fn update_tables_metadata(&self, table_paths: &HashSet<&TablePath>) -> Result<()> { + let server = self.cluster.read().get_one_available_server().clone(); + let conn = self.connections.get_connection(&server).await?; + + let update_table_paths: Vec<&TablePath> = table_paths.iter().copied().collect(); + let response = conn + .request(UpdateMetadataRequest::new(update_table_paths.as_slice())) + .await?; + self.update(response).await?; + Ok(()) + } + + pub async fn update_table_metadata(&self, table_path: &TablePath) -> 
Result<()> { + self.update_tables_metadata(&HashSet::from([table_path])) + .await + } + + pub async fn check_and_update_table_metadata(&self, table_paths: &[TablePath]) -> Result<()> { + let cluster_binding = self.cluster.read().clone(); + let need_update_table_paths: HashSet<&TablePath> = table_paths + .iter() + .filter(|table_path| cluster_binding.opt_get_table(table_path).is_none()) + .collect(); + if !need_update_table_paths.is_empty() { + self.update_tables_metadata(&need_update_table_paths) + .await?; + } + Ok(()) + } + + pub async fn get_connection(&self, server_node: &ServerNode) -> Result { + let result = self.connections.get_connection(server_node).await?; + Ok(result) + } + + pub fn get_cluster(&self) -> Arc { + let guard = self.cluster.read(); + guard.clone() + } + + pub fn leader_for(&self, _table_bucket: &TableBucket) -> Option<&ServerNode> { + todo!() + } +} diff --git a/fluss-rust/crates/fluss/src/client/mod.rs b/fluss-rust/crates/fluss/src/client/mod.rs new file mode 100644 index 0000000000..5b6908eec1 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/mod.rs @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +mod admin; +mod connection; +mod table; +mod write; + +pub use connection::*; +mod metadata; + +pub use write::*; diff --git a/fluss-rust/crates/fluss/src/client/table/append.rs b/fluss-rust/crates/fluss/src/client/table/append.rs new file mode 100644 index 0000000000..bf15266706 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/append.rs @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use crate::client::{WriteRecord, WriterClient};
+use crate::metadata::{TableInfo, TablePath};
+use crate::row::GenericRow;
+use std::sync::Arc;
+
+use crate::error::Result;
+
+/// Factory for append writers on a single table.
+///
+/// Keeps the table's path and info together with the writer client handle
+/// that every writer created from it will use.
+#[allow(dead_code)]
+pub struct TableAppend {
+    table_path: TablePath,
+    table_info: TableInfo,
+    writer_client: Arc,
+}
+
+impl TableAppend {
+    /// Stores the given table identity and writer client handle unchanged.
+    pub(super) fn new(
+        table_path: TablePath,
+        table_info: TableInfo,
+        writer_client: Arc,
+    ) -> Self {
+        Self {
+            table_path,
+            table_info,
+            writer_client,
+        }
+    }
+
+    /// Creates a writer bound to this table.
+    ///
+    /// The table path is cloned into a fresh `Arc` on every call; the writer
+    /// client handle is cloned from the one held here.
+    pub fn create_writer(&self) -> AppendWriter {
+        AppendWriter {
+            table_path: Arc::new(self.table_path.clone()),
+            writer_client: self.writer_client.clone(),
+        }
+    }
+}
+
+/// Appends rows to one table through a writer client.
+pub struct AppendWriter {
+    table_path: Arc,
+    writer_client: Arc,
+}
+
+impl AppendWriter {
+    /// Sends `row` to the table and waits for the write to be reported.
+    ///
+    /// The row is wrapped in a `WriteRecord`, handed to the writer client, and
+    /// the returned handle is awaited; whatever the handle reports for that
+    /// wait becomes the result of this call.
+    pub async fn append(&self, row: GenericRow<'_>) -> Result<()> {
+        let record = WriteRecord::new(self.table_path.clone(), row);
+        let result_handle = self.writer_client.send(&record).await?;
+        let result = result_handle.wait().await?;
+        result_handle.result(result)
+    }
+
+    /// Delegates to the writer client's `flush`.
+    pub async fn flush(&self) -> Result<()> {
+        self.writer_client.flush().await
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/table/mod.rs b/fluss-rust/crates/fluss/src/client/table/mod.rs
new file mode 100644
index 0000000000..503a1edb39
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/table/mod.rs
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::connection::FlussConnection;
+use crate::client::metadata::Metadata;
+use crate::client::table::append::TableAppend;
+use crate::client::table::scanner::TableScan;
+use crate::metadata::{TableInfo, TablePath};
+use std::sync::Arc;
+
+use crate::error::Result;
+
+mod append;
+
+mod scanner;
+mod writer;
+
+/// Handle to one table, borrowed from a [`FlussConnection`].
+///
+/// It is the starting point for building append writers (`new_append`) and
+/// scans (`new_scan`) on that table.
+#[allow(dead_code)]
+pub struct FlussTable<'a> {
+    conn: &'a FlussConnection,
+    metadata: Arc,
+    table_info: TableInfo,
+    table_path: TablePath,
+    has_primary_key: bool,
+}
+
+impl<'a> FlussTable<'a> {
+    /// Builds the handle from a connection, a metadata handle and table info.
+    ///
+    /// `table_path` and `has_primary_key` are read from `table_info` before it
+    /// is moved into the struct.
+    pub fn new(conn: &'a FlussConnection, metadata: Arc, table_info: TableInfo) -> Self {
+        FlussTable {
+            conn,
+            table_path: table_info.table_path.clone(),
+            has_primary_key: table_info.has_primary_key(),
+            table_info,
+            metadata,
+        }
+    }
+
+    /// Borrows the table info this handle was created with.
+    pub fn get_table_info(&self) -> &TableInfo {
+        &self.table_info
+    }
+
+    /// Builds a [`TableAppend`] for this table.
+    ///
+    /// Fails when the connection cannot provide a writer client
+    /// (`get_or_create_writer_client` returns an error).
+    pub fn new_append(&self) -> Result {
+        Ok(TableAppend::new(
+            self.table_path.clone(),
+            self.table_info.clone(),
+            self.conn.get_or_create_writer_client()?,
+        ))
+    }
+
+    /// Builds a [`TableScan`] for this table using the same connection and a
+    /// clone of the metadata handle.
+    pub fn new_scan(&self) -> TableScan<'_> {
+        TableScan::new(self.conn, self.table_info.clone(), self.metadata.clone())
+    }
+}
+
+// Placeholder: the destructor currently performs no work.
+impl<'a> Drop for FlussTable<'a> {
+    fn drop(&mut self) {
+        // do-nothing now
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs
new file mode 100644
index 0000000000..41fb17e8c8
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs
@@ -0,0 +1,370 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::connection::FlussConnection; +use crate::client::metadata::Metadata; +use crate::error::Result; +use crate::metadata::{TableBucket, TableInfo, TablePath}; +use crate::proto::{FetchLogRequest, PbFetchLogReqForBucket, PbFetchLogReqForTable}; +use crate::record::{LogRecordsBatchs, ReadContext, ScanRecord, ScanRecords, to_arrow_schema}; +use crate::rpc::RpcClient; +use crate::util::FairBucketStatusMap; +use parking_lot::RwLock; +use std::collections::HashMap; +use std::slice::from_ref; +use std::sync::Arc; +use std::time::Duration; + +const LOG_FETCH_MAX_BYTES: i32 = 16 * 1024 * 1024; +#[allow(dead_code)] +const LOG_FETCH_MAX_BYTES_FOR_BUCKET: i32 = 1024; +const LOG_FETCH_MIN_BYTES: i32 = 1; +const LOG_FETCH_WAIT_MAX_TIME: i32 = 500; + +pub struct TableScan<'a> { + conn: &'a FlussConnection, + table_info: TableInfo, + metadata: Arc, +} + +impl<'a> TableScan<'a> { + pub fn new(conn: &'a FlussConnection, table_info: TableInfo, metadata: Arc) -> Self { + Self { + conn, + table_info, + metadata, + } + } + + pub fn create_log_scanner(&self) -> LogScanner { + LogScanner::new( + &self.table_info, + self.metadata.clone(), + self.conn.get_connections(), + ) + } +} + +pub struct LogScanner { + table_path: TablePath, + table_id: i64, + 
metadata: Arc, + log_scanner_status: Arc, + log_fetcher: LogFetcher, +} + +impl LogScanner { + pub fn new( + table_info: &TableInfo, + metadata: Arc, + connections: Arc, + ) -> Self { + let log_scanner_status = Arc::new(LogScannerStatus::new()); + Self { + table_path: table_info.table_path.clone(), + table_id: table_info.table_id, + metadata: metadata.clone(), + log_scanner_status: log_scanner_status.clone(), + log_fetcher: LogFetcher::new( + table_info.clone(), + connections, + metadata.clone(), + log_scanner_status.clone(), + ), + } + } + + pub async fn poll(&self, _timeout: Duration) -> Result { + Ok(ScanRecords::new(self.poll_for_fetches().await?)) + } + + pub async fn subscribe(&self, bucket: i32, offset: i64) -> Result<()> { + let table_bucket = TableBucket::new(self.table_id, bucket); + self.metadata + .check_and_update_table_metadata(from_ref(&self.table_path)) + .await?; + self.log_scanner_status + .assign_scan_bucket(table_bucket, offset); + Ok(()) + } + + async fn poll_for_fetches(&self) -> Result>> { + self.log_fetcher.send_fetches_and_collect().await + } +} + +#[allow(dead_code)] +struct LogFetcher { + table_path: TablePath, + conns: Arc, + table_info: TableInfo, + metadata: Arc, + log_scanner_status: Arc, +} + +impl LogFetcher { + pub fn new( + table_info: TableInfo, + conns: Arc, + metadata: Arc, + log_scanner_status: Arc, + ) -> Self { + LogFetcher { + table_path: table_info.table_path.clone(), + conns: conns.clone(), + table_info: table_info.clone(), + metadata: metadata.clone(), + log_scanner_status: log_scanner_status.clone(), + } + } + + async fn send_fetches_and_collect(&self) -> Result>> { + let fetch_request = self.prepare_fetch_log_requests().await; + let mut result: HashMap> = HashMap::new(); + for (leader, fetch_request) in fetch_request { + let cluster = self.metadata.get_cluster(); + let server_node = cluster + .get_tablet_server(leader) + .expect("todo: handle leader not exist."); + let con = 
self.conns.get_connection(server_node).await?; + + let fetch_response = con + .request(crate::rpc::message::FetchLogRequest::new(fetch_request)) + .await?; + + for pb_fetch_log_resp in fetch_response.tables_resp { + let table_id = pb_fetch_log_resp.table_id; + let fetch_log_for_buckets = pb_fetch_log_resp.buckets_resp; + let arrow_schema = to_arrow_schema(self.table_info.get_row_type()); + for fetch_log_for_bucket in fetch_log_for_buckets { + let mut fetch_records = vec![]; + let bucket: i32 = fetch_log_for_bucket.bucket_id; + let table_bucket = TableBucket::new(table_id, bucket); + if fetch_log_for_bucket.records.is_some() { + let data = fetch_log_for_bucket.records.unwrap(); + for log_record in &mut LogRecordsBatchs::new(&data) { + let last_offset = log_record.last_log_offset(); + fetch_records + .extend(log_record.records(ReadContext::new(arrow_schema.clone()))); + self.log_scanner_status + .update_offset(&table_bucket, last_offset + 1); + } + } + result.insert(table_bucket, fetch_records); + } + } + } + + Ok(result) + } + + async fn prepare_fetch_log_requests(&self) -> HashMap { + let mut fetch_log_req_for_buckets = HashMap::new(); + let mut table_id = None; + let mut ready_for_fetch_count = 0; + for bucket in self.fetchable_buckets() { + if table_id.is_none() { + table_id = Some(bucket.table_id()); + } + + let offset = match self.log_scanner_status.get_bucket_offset(&bucket) { + Some(offset) => offset, + None => { + // todo: debug + continue; + } + }; + + if let Some(leader) = self.get_table_bucket_leader(&bucket) { + let fetch_log_req_for_bucket = PbFetchLogReqForBucket { + partition_id: None, + bucket_id: bucket.bucket_id(), + fetch_offset: offset, + // 1M + max_fetch_bytes: 1024 * 1024, + }; + + fetch_log_req_for_buckets + .entry(leader) + .or_insert_with(Vec::new) + .push(fetch_log_req_for_bucket); + ready_for_fetch_count += 1; + } + } + + if ready_for_fetch_count == 0 { + HashMap::new() + } else { + fetch_log_req_for_buckets + .into_iter() + 
.map(|(leader_id, feq_for_buckets)| { + let req_for_table = PbFetchLogReqForTable { + table_id: table_id.unwrap(), + projection_pushdown_enabled: false, + projected_fields: vec![], + buckets_req: feq_for_buckets, + }; + + let fetch_log_request = FetchLogRequest { + follower_server_id: -1, + max_bytes: LOG_FETCH_MAX_BYTES, + tables_req: vec![req_for_table], + max_wait_ms: Some(LOG_FETCH_WAIT_MAX_TIME), + min_bytes: Some(LOG_FETCH_MIN_BYTES), + }; + (leader_id, fetch_log_request) + }) + .collect() + } + } + + fn fetchable_buckets(&self) -> Vec { + // always available now + self.log_scanner_status.fetchable_buckets(|_| true) + } + + fn get_table_bucket_leader(&self, tb: &TableBucket) -> Option { + let cluster = self.metadata.get_cluster(); + cluster.leader_for(tb).map(|leader| leader.id()) + } +} + +pub struct LogScannerStatus { + bucket_status_map: Arc>>, +} + +#[allow(dead_code)] +impl LogScannerStatus { + pub fn new() -> Self { + Self { + bucket_status_map: Arc::new(RwLock::new(FairBucketStatusMap::new())), + } + } + + pub fn prepare_to_poll(&self) -> bool { + let map = self.bucket_status_map.read(); + map.size() > 0 + } + + pub fn move_bucket_to_end(&self, table_bucket: TableBucket) { + let mut map = self.bucket_status_map.write(); + map.move_to_end(table_bucket); + } + + /// Gets the offset of a bucket if it exists + pub fn get_bucket_offset(&self, table_bucket: &TableBucket) -> Option { + let map = self.bucket_status_map.read(); + map.status_value(table_bucket).map(|status| status.offset()) + } + + pub fn update_high_watermark(&self, table_bucket: &TableBucket, high_watermark: i64) { + if let Some(status) = self.get_status(table_bucket) { + status.set_high_watermark(high_watermark); + } + } + + pub fn update_offset(&self, table_bucket: &TableBucket, offset: i64) { + if let Some(status) = self.get_status(table_bucket) { + status.set_offset(offset); + } + } + + pub fn assign_scan_buckets(&self, scan_bucket_offsets: HashMap) { + let mut map = 
self.bucket_status_map.write(); + for (bucket, offset) in scan_bucket_offsets { + let status = map + .status_value(&bucket) + .cloned() + .unwrap_or_else(|| Arc::new(BucketScanStatus::new(offset))); + status.set_offset(offset); + map.update(bucket, status); + } + } + + pub fn assign_scan_bucket(&self, table_bucket: TableBucket, offset: i64) { + let status = Arc::new(BucketScanStatus::new(offset)); + self.bucket_status_map.write().update(table_bucket, status); + } + + /// Unassigns scan buckets + pub fn unassign_scan_buckets(&self, buckets: &[TableBucket]) { + let mut map = self.bucket_status_map.write(); + for bucket in buckets { + map.remove(bucket); + } + } + + /// Gets fetchable buckets based on availability predicate + pub fn fetchable_buckets(&self, is_available: F) -> Vec + where + F: Fn(&TableBucket) -> bool, + { + let map = self.bucket_status_map.read(); + let mut result = Vec::new(); + map.for_each(|bucket, _| { + if is_available(bucket) { + result.push(bucket.clone()); + } + }); + result + } + + /// Helper to get bucket status + fn get_status(&self, table_bucket: &TableBucket) -> Option> { + let map = self.bucket_status_map.read(); + map.status_value(table_bucket).cloned() + } +} + +impl Default for LogScannerStatus { + fn default() -> Self { + Self::new() + } +} + +#[derive(Debug)] +#[allow(dead_code)] +pub struct BucketScanStatus { + offset: RwLock, + high_watermark: RwLock, +} + +#[allow(dead_code)] +impl BucketScanStatus { + pub fn new(offset: i64) -> Self { + Self { + offset: RwLock::new(offset), + high_watermark: RwLock::new(0), + } + } + + pub fn offset(&self) -> i64 { + *self.offset.read() + } + + pub fn set_offset(&self, offset: i64) { + *self.offset.write() = offset + } + + pub fn high_watermark(&self) -> i64 { + *self.high_watermark.read() + } + + pub fn set_high_watermark(&self, high_watermark: i64) { + *self.high_watermark.write() = high_watermark + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/writer.rs 
b/fluss-rust/crates/fluss/src/client/table/writer.rs new file mode 100644 index 0000000000..b2ba881b36 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/writer.rs @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::{WriteRecord, WriterClient}; +use crate::row::GenericRow; +use std::sync::Arc; + +use crate::error::Result; +use crate::metadata::{TableInfo, TablePath}; + +#[allow(dead_code)] +pub trait TableWriter { + async fn flush(&self) -> Result<()>; +} + +#[allow(dead_code)] +pub trait AppendWriter: TableWriter { + async fn append(&self, row: GenericRow) -> Result<()>; +} + +#[allow(dead_code)] +pub trait UpsertWriter: TableWriter { + async fn upsert(&self, row: GenericRow) -> Result<()>; + async fn delete(&self, row: GenericRow) -> Result<()>; +} + +#[allow(dead_code)] +pub struct AbstractTableWriter { + table_path: Arc, + writer_client: Arc, + field_count: i32, +} + +#[allow(dead_code)] +impl AbstractTableWriter { + pub fn new( + table_path: TablePath, + table_info: &TableInfo, + writer_client: Arc, + ) -> Self { + // todo: partition + Self { + table_path: Arc::new(table_path), + writer_client, + field_count: table_info.row_type().fields().len() as i32, + } + } + + pub async fn send(&self, write_record: &WriteRecord<'_>) -> Result<()> { + let result_handle = self.writer_client.send(write_record).await?; + let result = result_handle.wait().await?; + result_handle.result(result) + } +} + +impl TableWriter for AbstractTableWriter { + async fn flush(&self) -> Result<()> { + todo!() + } +} + +// Append writer implementation +#[allow(dead_code)] +pub struct AppendWriterImpl { + base: AbstractTableWriter, +} + +#[allow(dead_code)] +impl AppendWriterImpl { + pub async fn append(&self, row: GenericRow<'_>) -> Result<()> { + let record = WriteRecord::new(self.base.table_path.clone(), row); + self.base.send(&record).await + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs new file mode 100644 index 0000000000..0b77894025 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs @@ -0,0 +1,442 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// 
or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::write::batch::WriteBatch::ArrowLog; +use crate::client::write::batch::{ArrowLogWriteBatch, WriteBatch}; +use crate::client::{ResultHandle, WriteRecord}; +use crate::cluster::{BucketLocation, Cluster, ServerNode}; +use crate::config::Config; +use crate::error::Result; +use crate::metadata::{TableBucket, TablePath}; +use crate::util::current_time_ms; +use crate::{BucketId, PartitionId, TableId}; +use dashmap::DashMap; +use parking_lot::RwLock; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::sync::Arc; +use std::sync::atomic::{AtomicI32, AtomicI64, Ordering}; +use tokio::sync::Mutex; + +#[allow(dead_code)] +pub struct RecordAccumulator { + config: Config, + write_batches: DashMap, + // batch_id -> complete callback + incomplete_batches: RwLock>, + batch_timeout_ms: i64, + closed: bool, + flushes_in_progress: AtomicI32, + appends_in_progress: i32, + nodes_drain_index: Mutex>, + batch_id: AtomicI64, +} + +impl RecordAccumulator { + pub fn new(config: Config) -> Self { + RecordAccumulator { + config, + write_batches: Default::default(), + incomplete_batches: Default::default(), + batch_timeout_ms: 500, + closed: Default::default(), + flushes_in_progress: Default::default(), + appends_in_progress: 
Default::default(), + nodes_drain_index: Default::default(), + batch_id: Default::default(), + } + } + + fn try_append( + &self, + record: &WriteRecord, + dq: &mut VecDeque, + ) -> Result> { + let dq_size = dq.len(); + if let Some(last_batch) = dq.back_mut() { + return if let Some(result_handle) = last_batch.try_append(record)? { + Ok(Some(RecordAppendResult::new( + result_handle, + dq_size > 1 || last_batch.is_closed(), + false, + false, + ))) + } else { + Ok(None) + }; + } + Ok(None) + } + + fn append_new_batch( + &self, + cluster: &Cluster, + record: &WriteRecord, + bucket_id: BucketId, + dq: &mut VecDeque, + ) -> Result { + if let Some(append_result) = self.try_append(record, dq)? { + return Ok(append_result); + } + + let table_path = &record.table_path; + + let row_type = &cluster.get_table(table_path).row_type; + + let mut batch = ArrowLog(ArrowLogWriteBatch::new( + self.batch_id.fetch_add(1, Ordering::Relaxed), + table_path.as_ref().clone(), + 0, + row_type, + bucket_id, + current_time_ms(), + )); + + let batch_id = batch.batch_id(); + + let result_handle = batch + .try_append(record)? 
+ .expect("must append to a new batch"); + + let batch_is_closed = batch.is_closed(); + dq.push_back(batch); + + self.incomplete_batches + .write() + .insert(batch_id, result_handle.clone()); + Ok(RecordAppendResult::new( + result_handle, + dq.len() > 1 || batch_is_closed, + true, + false, + )) + } + + pub async fn append( + &self, + record: &WriteRecord<'_>, + bucket_id: BucketId, + cluster: &Cluster, + abort_if_batch_full: bool, + ) -> Result { + let table_path = &record.table_path; + let mut binding = self + .write_batches + .entry(table_path.as_ref().clone()) + .or_insert_with(|| BucketAndWriteBatches { + table_id: 0, + is_partitioned_table: false, + partition_id: None, + batches: Default::default(), + }); + let bucket_and_batches = binding.value_mut(); + let dq = bucket_and_batches + .batches + .entry(bucket_id) + .or_insert_with(|| Mutex::new(VecDeque::new())); + let mut dq_guard = dq.lock().await; + if let Some(append_result) = self.try_append(record, &mut dq_guard)? { + return Ok(append_result); + } + + if abort_if_batch_full { + return Ok(RecordAppendResult::new_without_result_handle( + true, false, true, + )); + } + + self.append_new_batch(cluster, record, bucket_id, &mut dq_guard) + } + + pub async fn ready(&self, cluster: &Arc) -> ReadyCheckResult { + let mut ready_nodes = HashSet::new(); + let mut next_ready_check_delay_ms = self.batch_timeout_ms; + let mut unknown_leader_tables = HashSet::new(); + for entry in self.write_batches.iter() { + let table_path = entry.key(); + let batches = entry.value(); + next_ready_check_delay_ms = self + .bucket_ready( + table_path, + batches, + &mut ready_nodes, + &mut unknown_leader_tables, + cluster, + next_ready_check_delay_ms, + ) + .await + } + + ReadyCheckResult { + ready_nodes, + next_ready_check_delay_ms, + unknown_leader_tables, + } + } + + async fn bucket_ready( + &self, + table_path: &TablePath, + batches: &BucketAndWriteBatches, + ready_nodes: &mut HashSet, + unknown_leader_tables: &mut HashSet, + cluster: 
&Cluster, + next_ready_check_delay_ms: i64, + ) -> i64 { + let mut next_delay = next_ready_check_delay_ms; + + for (bucket_id, batch) in batches.batches.iter() { + let batch_guard = batch.lock().await; + if batch_guard.is_empty() { + continue; + } + + let batch = batch_guard.front().unwrap(); + let waited_time_ms = batch.waited_time_ms(current_time_ms()); + let deque_size = batch_guard.len(); + let full = deque_size > 1 || batch.is_closed(); + let table_bucket = cluster.get_table_bucket(table_path, *bucket_id); + if let Some(leader) = cluster.leader_for(&table_bucket) { + next_delay = + self.batch_ready(leader, waited_time_ms, full, ready_nodes, next_delay); + } else { + unknown_leader_tables.insert(table_path.clone()); + } + } + next_delay + } + + fn batch_ready( + &self, + leader: &ServerNode, + waited_time_ms: i64, + full: bool, + ready_nodes: &mut HashSet, + next_ready_check_delay_ms: i64, + ) -> i64 { + if !ready_nodes.contains(leader) { + let expired = waited_time_ms >= self.batch_timeout_ms; + let sendable = full || expired || self.closed || self.flush_in_progress(); + + if sendable { + ready_nodes.insert(leader.clone()); + } else { + let time_left_ms = self.batch_timeout_ms.saturating_sub(waited_time_ms); + return next_ready_check_delay_ms.min(time_left_ms); + } + } + next_ready_check_delay_ms + } + + pub async fn drain( + &self, + cluster: Arc, + nodes: &HashSet, + max_size: i32, + ) -> Result>>> { + if nodes.is_empty() { + return Ok(HashMap::new()); + } + let mut batches = HashMap::new(); + for node in nodes { + let ready = self + .drain_batches_for_one_node(&cluster, node, max_size) + .await?; + if !ready.is_empty() { + batches.insert(node.id(), ready); + } + } + + Ok(batches) + } + + async fn drain_batches_for_one_node( + &self, + cluster: &Cluster, + node: &ServerNode, + max_size: i32, + ) -> Result>> { + let mut size = 0; + let buckets = self.get_all_buckets_in_current_node(node, cluster); + let mut ready = Vec::new(); + + if buckets.is_empty() { + 
return Ok(ready); + } + + let mut nodes_drain_index_guard = self.nodes_drain_index.lock().await; + let drain_index = nodes_drain_index_guard.entry(node.id()).or_insert(0); + let start = *drain_index % buckets.len(); + let mut current_index = start; + + loop { + let bucket = &buckets[current_index]; + let table_path = bucket.table_path.clone(); + let table_bucket = bucket.table_bucket.clone(); + nodes_drain_index_guard.insert(node.id(), current_index); + current_index = (current_index + 1) % buckets.len(); + + let bucket_and_write_batches = self.write_batches.get(&table_path); + if let Some(bucket_and_write_batches) = bucket_and_write_batches { + if let Some(deque) = bucket_and_write_batches + .batches + .get(&table_bucket.bucket_id()) + { + let mut batch = { + let mut batch_lock = deque.lock().await; + if batch_lock.is_empty() { + continue; + } + let first_batch = batch_lock.front().unwrap(); + + if size + first_batch.estimated_size_in_bytes() > max_size as i64 + && !ready.is_empty() + { + // there is a rare case that a single batch size is larger than the request size + // due to compression; in this case we will still eventually send this batch in + // a single request. + break; + } + + batch_lock.pop_front().unwrap() + }; + + let current_batch_size = batch.estimated_size_in_bytes(); + size += current_batch_size; + + // mark the batch as drained. 
+ batch.drained(current_time_ms()); + ready.push(Arc::new(ReadyWriteBatch { + table_bucket, + write_batch: batch, + })); + } + } + if current_index == start { + break; + } + } + Ok(ready) + } + + pub fn remove_incomplete_batches(&self, batch_id: i64) { + self.incomplete_batches.write().remove(&batch_id); + } + + fn get_all_buckets_in_current_node( + &self, + current: &ServerNode, + cluster: &Cluster, + ) -> Vec { + let mut buckets = vec![]; + for bucket_locations in cluster.get_bucket_locations_by_path().values() { + for bucket_location in bucket_locations { + if let Some(leader) = bucket_location.leader() { + if current.id() == leader.id() { + buckets.push(bucket_location.clone()); + } + } + } + } + buckets + } + + fn flush_in_progress(&self) -> bool { + self.flushes_in_progress.load(Ordering::SeqCst) > 0 + } + + pub fn begin_flush(&self) { + self.flushes_in_progress.fetch_add(1, Ordering::SeqCst); + } + + #[allow(unused_must_use)] + #[allow(clippy::await_holding_lock)] + pub async fn await_flush_completion(&self) -> Result<()> { + for result_handle in self.incomplete_batches.read().values() { + result_handle.wait().await?; + } + Ok(()) + } +} + +pub struct ReadyWriteBatch { + pub table_bucket: TableBucket, + pub write_batch: WriteBatch, +} + +#[allow(dead_code)] +struct BucketAndWriteBatches { + table_id: TableId, + is_partitioned_table: bool, + partition_id: Option, + batches: HashMap>>, +} + +pub struct RecordAppendResult { + pub batch_is_full: bool, + pub new_batch_created: bool, + pub abort_record_for_new_batch: bool, + pub result_handle: Option, +} + +impl RecordAppendResult { + fn new( + result_handle: ResultHandle, + batch_is_full: bool, + new_batch_created: bool, + abort_record_for_new_batch: bool, + ) -> Self { + Self { + batch_is_full, + new_batch_created, + abort_record_for_new_batch, + result_handle: Some(result_handle), + } + } + + fn new_without_result_handle( + batch_is_full: bool, + new_batch_created: bool, + abort_record_for_new_batch: bool, + ) 
-> Self { + Self { + batch_is_full, + new_batch_created, + abort_record_for_new_batch, + result_handle: None, + } + } +} + +pub struct ReadyCheckResult { + pub ready_nodes: HashSet, + pub next_ready_check_delay_ms: i64, + pub unknown_leader_tables: HashSet, +} + +impl ReadyCheckResult { + pub fn new( + ready_nodes: HashSet, + next_ready_check_delay_ms: i64, + unknown_leader_tables: HashSet, + ) -> Self { + ReadyCheckResult { + ready_nodes, + next_ready_check_delay_ms, + unknown_leader_tables, + } + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/batch.rs b/fluss-rust/crates/fluss/src/client/write/batch.rs new file mode 100644 index 0000000000..64c5dd6517 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/batch.rs @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::BucketId; +use crate::client::broadcast::{BatchWriteResult, BroadcastOnce}; +use crate::client::{ResultHandle, WriteRecord}; +use crate::metadata::{DataType, TablePath}; +use std::cmp::max; + +use crate::error::Result; +use crate::record::MemoryLogRecordsArrowBuilder; + +#[allow(dead_code)] +pub struct InnerWriteBatch { + batch_id: i64, + table_path: TablePath, + create_ms: i64, + bucket_id: BucketId, + results: BroadcastOnce, + completed: bool, + drained_ms: i64, +} + +impl InnerWriteBatch { + fn new(batch_id: i64, table_path: TablePath, create_ms: i64, bucket_id: BucketId) -> Self { + InnerWriteBatch { + batch_id, + table_path, + create_ms, + bucket_id, + results: Default::default(), + completed: Default::default(), + drained_ms: -1, + } + } + + fn waited_time_ms(&self, now: i64) -> i64 { + max(0i64, now - self.create_ms) + } + + fn complete(&self, write_result: BatchWriteResult) -> bool { + if !self.completed { + self.results.broadcast(write_result); + } + true + } + + fn drained(&mut self, now_ms: i64) { + self.drained_ms = max(self.drained_ms, now_ms); + } +} + +pub enum WriteBatch { + ArrowLog(ArrowLogWriteBatch), +} + +impl WriteBatch { + pub fn inner_batch(&self) -> &InnerWriteBatch { + match self { + WriteBatch::ArrowLog(batch) => &batch.write_batch, + } + } + + pub fn try_append(&mut self, write_record: &WriteRecord) -> Result> { + match self { + WriteBatch::ArrowLog(batch) => batch.try_append(write_record), + } + } + + pub fn waited_time_ms(&self, now: i64) -> i64 { + self.inner_batch().waited_time_ms(now) + } + + pub fn close(&mut self) { + match self { + WriteBatch::ArrowLog(batch) => { + batch.close(); + } + } + } + + pub fn estimated_size_in_bytes(&self) -> i64 { + 0 + // todo: calculate estimated_size_in_bytes + } + + pub fn is_closed(&self) -> bool { + match self { + WriteBatch::ArrowLog(batch) => batch.is_closed(), + } + } + + pub fn drained(&mut self, now_ms: i64) { + match self { + WriteBatch::ArrowLog(batch) => { + 
batch.write_batch.drained(now_ms); + } + } + } + + pub fn build(&self) -> Result> { + match self { + WriteBatch::ArrowLog(batch) => batch.build(), + } + } + + pub fn complete(&self, write_result: BatchWriteResult) -> bool { + self.inner_batch().complete(write_result) + } + + pub fn batch_id(&self) -> i64 { + self.inner_batch().batch_id + } +} + +pub struct ArrowLogWriteBatch { + pub write_batch: InnerWriteBatch, + pub arrow_builder: MemoryLogRecordsArrowBuilder, +} + +impl ArrowLogWriteBatch { + pub fn new( + batch_id: i64, + table_path: TablePath, + schema_id: i32, + row_type: &DataType, + bucket_id: BucketId, + create_ms: i64, + ) -> Self { + let base = InnerWriteBatch::new(batch_id, table_path, create_ms, bucket_id); + + Self { + write_batch: base, + arrow_builder: MemoryLogRecordsArrowBuilder::new(schema_id, row_type), + } + } + + pub fn batch_id(&self) -> i64 { + self.write_batch.batch_id + } + + pub fn try_append(&mut self, write_record: &WriteRecord) -> Result> { + if self.arrow_builder.is_closed() || self.arrow_builder.is_full() { + Ok(None) + } else { + self.arrow_builder.append(&write_record.row)?; + Ok(Some(ResultHandle::new(self.write_batch.results.receiver()))) + } + } + + pub fn build(&self) -> Result> { + self.arrow_builder.build() + } + + pub fn is_closed(&self) -> bool { + self.arrow_builder.is_closed() + } + + pub fn close(&mut self) { + self.arrow_builder.close() + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/broadcast.rs b/fluss-rust/crates/fluss/src/client/write/broadcast.rs new file mode 100644 index 0000000000..2dcc34cbc4 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/broadcast.rs @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use parking_lot::RwLock; +use std::sync::Arc; +use thiserror::Error; +use tokio::sync::Notify; +use tracing::warn; + +pub type Result = std::result::Result; + +pub type BatchWriteResult = Result<(), Error>; + +#[derive(Debug, Error, Clone, PartialEq, Eq)] +pub enum Error { + #[error("BroadcastOnce dropped")] + Dropped, +} + +#[derive(Debug, Clone)] +pub struct BroadcastOnceReceiver { + shared: Arc>, +} + +impl BroadcastOnceReceiver { + /// Returns `Some(_)` if data has been produced + pub fn peek(&self) -> Option> { + self.shared.data.read().clone() + } + + /// Waits for [`BroadcastOnce::broadcast`] to be called or returns an error + /// if the [`BroadcastOnce`] is dropped without a value being published + pub async fn receive(&self) -> Result { + let notified = self.shared.notify.notified(); + + if let Some(v) = self.peek() { + return v; + } + + notified.await; + + self.peek().expect("just got notified") + } +} + +#[derive(Debug)] +struct Shared { + data: RwLock>>, + notify: Notify, +} + +#[derive(Debug)] +pub struct BroadcastOnce +where + T: Send + Sync, +{ + shared: Arc>, +} + +impl Default for BroadcastOnce +where + T: Send + Sync, +{ + fn default() -> Self { + Self { + shared: Arc::new(Shared { + data: Default::default(), + notify: Default::default(), + }), + } + } +} + +impl BroadcastOnce { + /// Returns a [`BroadcastOnceReceiver`] that can be used to wait on + /// a call to 
[`BroadcastOnce::broadcast`] on this instance + pub fn receiver(&self) -> BroadcastOnceReceiver { + BroadcastOnceReceiver { + shared: Arc::clone(&self.shared), + } + } + + /// Broadcast a value to all [`BroadcastOnceReceiver`] handles + pub fn broadcast(&self, r: T) { + let mut locked = self.shared.data.write(); + assert!(locked.is_none(), "double publish"); + + *locked = Some(Ok(r)); + self.shared.notify.notify_waiters(); + } +} + +impl Drop for BroadcastOnce +where + T: Send + Sync, +{ + fn drop(&mut self) { + let mut data = self.shared.data.write(); + if data.is_none() { + warn!("BroadcastOnce dropped without producing"); + *data = Some(Err(Error::Dropped)); + self.shared.notify.notify_waiters(); + } + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs b/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs new file mode 100644 index 0000000000..991c5f9197 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::cluster::Cluster; +use crate::metadata::TablePath; +use rand::Rng; +use std::sync::atomic::{AtomicI32, Ordering}; + +pub trait BucketAssigner: Sync + Send { + fn abort_if_batch_full(&self) -> bool; + + fn on_new_batch(&self, cluster: &Cluster, prev_bucket_id: i32); + + fn assign_bucket(&self, bucket_key: Option<&[u8]>, cluster: &Cluster) -> i32; +} + +#[derive(Debug)] +pub struct StickyBucketAssigner { + table_path: TablePath, + current_bucket_id: AtomicI32, +} + +impl StickyBucketAssigner { + pub fn new(table_path: TablePath) -> Self { + Self { + table_path, + current_bucket_id: AtomicI32::new(-1), + } + } + + fn next_bucket(&self, cluster: &Cluster, prev_bucket_id: i32) -> i32 { + let old_bucket = self.current_bucket_id.load(Ordering::Relaxed); + let mut new_bucket = old_bucket; + if old_bucket < 0 || old_bucket == prev_bucket_id { + let available_buckets = cluster.get_available_buckets_for_table_path(&self.table_path); + if available_buckets.is_empty() { + let mut rng = rand::rng(); + let mut random: i32 = rng.random(); + random &= i32::MAX; + new_bucket = random % cluster.get_bucket_count(&self.table_path); + } else if available_buckets.len() == 1 { + new_bucket = available_buckets[0].table_bucket.bucket_id(); + } else { + let mut rng = rand::rng(); + while new_bucket < 0 || new_bucket == old_bucket { + let mut random: i32 = rng.random(); + random &= i32::MAX; + new_bucket = available_buckets + [(random % available_buckets.len() as i32) as usize] + .bucket_id(); + } + } + } + + if old_bucket < 0 { + self.current_bucket_id.store(new_bucket, Ordering::Relaxed); + } else { + self.current_bucket_id + .compare_exchange( + prev_bucket_id, + new_bucket, + Ordering::Relaxed, + Ordering::Relaxed, + ) + .ok(); + } + self.current_bucket_id.load(Ordering::Relaxed) + } +} + +impl BucketAssigner for StickyBucketAssigner { + fn abort_if_batch_full(&self) -> bool { + true + } + + fn on_new_batch(&self, cluster: &Cluster, prev_bucket_id: i32) { + 
self.next_bucket(cluster, prev_bucket_id); + } + + fn assign_bucket(&self, _bucket_key: Option<&[u8]>, cluster: &Cluster) -> i32 { + let bucket_id = self.current_bucket_id.load(Ordering::Relaxed); + if bucket_id < 0 { + self.next_bucket(cluster, bucket_id) + } else { + bucket_id + } + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/mod.rs b/fluss-rust/crates/fluss/src/client/write/mod.rs new file mode 100644 index 0000000000..74df951115 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/mod.rs @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +mod accumulator; +mod batch; + +use crate::client::broadcast::{BatchWriteResult, BroadcastOnceReceiver}; +use crate::error::Error; +use crate::metadata::TablePath; +use crate::row::GenericRow; +pub use accumulator::*; +use std::sync::Arc; + +pub(crate) mod broadcast; +mod bucket_assigner; + +mod sender; +mod writer_client; + +pub use writer_client::WriterClient; + +pub struct WriteRecord<'a> { + pub row: GenericRow<'a>, + pub table_path: Arc, +} + +impl<'a> WriteRecord<'a> { + pub fn new(table_path: Arc, row: GenericRow<'a>) -> Self { + Self { row, table_path } + } +} + +#[derive(Debug, Clone)] +pub struct ResultHandle { + receiver: BroadcastOnceReceiver, +} + +impl ResultHandle { + pub fn new(receiver: BroadcastOnceReceiver) -> Self { + ResultHandle { receiver } + } + + pub async fn wait(&self) -> Result { + self.receiver + .receive() + .await + .map_err(|e| Error::WriteError(e.to_string())) + } + + pub fn result(&self, batch_result: BatchWriteResult) -> Result<(), Error> { + // do nothing, just return empty result + batch_result.map_err(|e| Error::WriteError(e.to_string())) + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/sender.rs b/fluss-rust/crates/fluss/src/client/write/sender.rs new file mode 100644 index 0000000000..381e10c5b1 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/sender.rs @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::metadata::Metadata; +use crate::client::{ReadyWriteBatch, RecordAccumulator}; +use crate::error::Error::WriteError; +use crate::error::Result; +use crate::metadata::TableBucket; +use crate::proto::ProduceLogResponse; +use crate::rpc::ProduceLogRequest; +use parking_lot::Mutex; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; + +#[allow(dead_code)] +pub struct Sender { + running: bool, + metadata: Arc, + accumulator: Arc, + in_flight_batches: Mutex>>>, + max_request_size: i32, + ack: i16, + max_request_timeout_ms: i32, + retries: i32, +} + +impl Sender { + pub fn new( + metadata: Arc, + accumulator: Arc, + max_request_size: i32, + max_request_timeout_ms: i32, + ack: i16, + retries: i32, + ) -> Self { + Self { + running: true, + metadata, + accumulator, + in_flight_batches: Default::default(), + max_request_size, + ack, + max_request_timeout_ms, + retries, + } + } + + pub async fn run(&self) -> Result<()> { + loop { + if !self.running { + return Ok(()); + } + self.run_once().await?; + } + } + + async fn run_once(&self) -> Result<()> { + let cluster = self.metadata.get_cluster(); + let ready_check_result = self.accumulator.ready(&cluster).await; + + // Update metadata if needed + if !ready_check_result.unknown_leader_tables.is_empty() { + self.metadata + .update_tables_metadata(&ready_check_result.unknown_leader_tables.iter().collect()) + .await?; + } + + if ready_check_result.ready_nodes.is_empty() { + tokio::time::sleep(Duration::from_millis( + ready_check_result.next_ready_check_delay_ms as u64, + 
)) + .await; + return Ok(()); + } + + let batches = self + .accumulator + .drain( + cluster.clone(), + &ready_check_result.ready_nodes, + self.max_request_size, + ) + .await?; + + if !batches.is_empty() { + self.add_to_inflight_batches(&batches); + self.send_write_requests(&batches).await?; + } + + Ok(()) + } + + fn add_to_inflight_batches(&self, batches: &HashMap>>) { + let mut in_flight = self.in_flight_batches.lock(); + for batch_list in batches.values() { + for batch in batch_list { + in_flight + .entry(batch.table_bucket.clone()) + .or_default() + .push(batch.clone()); + } + } + } + + async fn send_write_requests( + &self, + collated: &HashMap>>, + ) -> Result<()> { + for (leader_id, batches) in collated { + println!("send request batch"); + self.send_write_request(*leader_id, self.ack, batches) + .await?; + } + Ok(()) + } + + async fn send_write_request( + &self, + destination: i32, + acks: i16, + batches: &Vec>, + ) -> Result<()> { + if batches.is_empty() { + return Ok(()); + } + let mut records_by_bucket = HashMap::new(); + let mut write_batch_by_table = HashMap::new(); + + for batch in batches { + records_by_bucket.insert(batch.table_bucket.clone(), batch.clone()); + write_batch_by_table + .entry(batch.table_bucket.table_id()) + .or_insert_with(Vec::new) + .push(batch); + } + + let cluster = self.metadata.get_cluster(); + + let destination_node = cluster + .get_tablet_server(destination) + .ok_or(WriteError(String::from("destination node not found")))?; + let connection = self.metadata.get_connection(destination_node).await?; + + for (table_id, write_batches) in write_batch_by_table { + let request = + ProduceLogRequest::new(table_id, acks, self.max_request_timeout_ms, write_batches)?; + let response = connection.request(request).await?; + self.handle_produce_response(table_id, &records_by_bucket, response)? 
+ } + + Ok(()) + } + + fn handle_produce_response( + &self, + table_id: i64, + records_by_bucket: &HashMap>, + response: ProduceLogResponse, + ) -> Result<()> { + for produce_log_response_for_bucket in response.buckets_resp.iter() { + let tb = TableBucket::new(table_id, produce_log_response_for_bucket.bucket_id); + + let ready_batch = records_by_bucket.get(&tb).unwrap(); + if let Some(error_code) = produce_log_response_for_bucket.error_code { + todo!("handle_produce_response error: {}", error_code) + } else { + self.complete_batch(ready_batch) + } + } + Ok(()) + } + + fn complete_batch(&self, ready_write_batch: &Arc) { + if ready_write_batch.write_batch.complete(Ok(())) { + // remove from in flight batches + let mut in_flight_guard = self.in_flight_batches.lock(); + if let Some(in_flight) = in_flight_guard.get_mut(&ready_write_batch.table_bucket) { + in_flight.retain(|b| !Arc::ptr_eq(b, ready_write_batch)); + if in_flight.is_empty() { + in_flight_guard.remove(&ready_write_batch.table_bucket); + } + } + // remove from incomplete batches + self.accumulator + .remove_incomplete_batches(ready_write_batch.write_batch.batch_id()) + } + } + + pub async fn close(&mut self) { + self.running = false; + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/writer_client.rs b/fluss-rust/crates/fluss/src/client/write/writer_client.rs new file mode 100644 index 0000000000..01fe2899ba --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/writer_client.rs @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::metadata::Metadata; +use crate::client::write::bucket_assigner::{BucketAssigner, StickyBucketAssigner}; +use crate::client::write::sender::Sender; +use crate::client::{RecordAccumulator, ResultHandle, WriteRecord}; +use crate::config::Config; +use crate::metadata::TablePath; +use dashmap::DashMap; +use std::sync::Arc; +use tokio::sync::mpsc; +use tokio::task::JoinHandle; + +use crate::error::{Error, Result}; + +#[allow(dead_code)] +pub struct WriterClient { + config: Config, + max_request_size: i32, + accumulate: Arc, + shutdown_tx: mpsc::Sender<()>, + sender_join_handle: JoinHandle<()>, + metadata: Arc, + bucket_assigners: DashMap>>, +} + +impl WriterClient { + pub fn new(config: Config, metadata: Arc) -> Result { + let (shutdown_tx, mut shutdown_rx) = mpsc::channel(1); + + let accumulator = Arc::new(RecordAccumulator::new(config.clone())); + + let mut sender = Sender::new( + metadata.clone(), + accumulator.clone(), + config.request_max_size, + 30_000, + Self::get_ack(&config)?, + config.writer_retries, + ); + + let join_handle = tokio::spawn(async move { + tokio::select! 
{ + _ = sender.run() => { + // do-nothing + }, + _ = shutdown_rx.recv() => { + sender.close().await + } + } + }); + + Ok(Self { + max_request_size: config.request_max_size, + config, + shutdown_tx, + sender_join_handle: join_handle, + accumulate: accumulator, + metadata, + bucket_assigners: Default::default(), + }) + } + + fn get_ack(config: &Config) -> Result { + let acks = config.writer_acks.as_str(); + if acks.eq("all") { + Ok(-1) + } else { + acks.parse::() + .map_err(|e| Error::IllegalArgument(e.to_string())) + } + } + + pub async fn send(&self, record: &WriteRecord<'_>) -> Result { + let table_path = &record.table_path; + let cluster = self.metadata.get_cluster(); + + let bucket_assigner = { + if let Some(assigner) = self.bucket_assigners.get(table_path) { + assigner.clone() + } else { + let assigner = Arc::new(Self::create_bucket_assigner(table_path.as_ref())); + self.bucket_assigners + .insert(table_path.as_ref().clone(), assigner.clone()); + assigner + } + }; + + let bucket_id = bucket_assigner.assign_bucket(None, &cluster); + + let mut result = self.accumulate.append(record, 1, &cluster, true).await?; + + if result.abort_record_for_new_batch { + let prev_bucket_id = bucket_id; + bucket_assigner.on_new_batch(&cluster, prev_bucket_id); + let bucket_id = bucket_assigner.assign_bucket(None, &cluster); + result = self + .accumulate + .append(record, bucket_id, &cluster, false) + .await?; + } + + if result.batch_is_full || result.new_batch_created { + // todo: wakeup + } + + Ok(result.result_handle.expect("result_handle should exist")) + } + + pub async fn close(self) -> Result<()> { + self.shutdown_tx + .send(()) + .await + .map_err(|e| Error::WriteError(e.to_string()))?; + + self.sender_join_handle + .await + .map_err(|e| Error::WriteError(e.to_string()))?; + Ok(()) + } + + pub async fn flush(&self) -> Result<()> { + self.accumulate.begin_flush(); + self.accumulate.await_flush_completion().await?; + Ok(()) + } + + pub fn create_bucket_assigner(table_path: 
&TablePath) -> Box { + // always sticky + Box::new(StickyBucketAssigner::new(table_path.clone())) + } +} diff --git a/fluss-rust/crates/fluss/src/cluster/cluster.rs b/fluss-rust/crates/fluss/src/cluster/cluster.rs new file mode 100644 index 0000000000..1f8341dd66 --- /dev/null +++ b/fluss-rust/crates/fluss/src/cluster/cluster.rs @@ -0,0 +1,243 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::BucketId; +use crate::cluster::{BucketLocation, ServerNode, ServerType}; +use crate::error::Result; +use crate::metadata::{JsonSerde, TableBucket, TableDescriptor, TableInfo, TablePath}; +use crate::proto::MetadataResponse; +use crate::rpc::{from_pb_server_node, from_pb_table_path}; +use rand::random_range; +use std::collections::HashMap; + +static EMPTY: Vec = Vec::new(); + +#[derive(Default)] +pub struct Cluster { + coordinator_server: Option, + alive_tablet_servers_by_id: HashMap, + alive_tablet_servers: Vec, + available_locations_by_path: HashMap>, + available_locations_by_bucket: HashMap, + table_id_by_path: HashMap, + table_path_by_id: HashMap, + table_info_by_path: HashMap, +} + +impl Cluster { + pub fn new( + coordinator_server: Option, + alive_tablet_servers_by_id: HashMap, + available_locations_by_path: HashMap>, + available_locations_by_bucket: HashMap, + table_id_by_path: HashMap, + table_info_by_path: HashMap, + ) -> Self { + let alive_tablet_servers = alive_tablet_servers_by_id.values().cloned().collect(); + let table_path_by_id = table_id_by_path + .iter() + .map(|(path, table_id)| (*table_id, path.clone())) + .collect(); + Cluster { + coordinator_server, + alive_tablet_servers_by_id, + alive_tablet_servers, + available_locations_by_path, + available_locations_by_bucket, + table_id_by_path, + table_path_by_id, + table_info_by_path, + } + } + + pub fn update(&mut self, cluster: Cluster) { + let Cluster { + coordinator_server, + alive_tablet_servers_by_id, + alive_tablet_servers, + available_locations_by_path, + available_locations_by_bucket, + table_id_by_path, + table_path_by_id, + table_info_by_path, + } = cluster; + self.coordinator_server = coordinator_server; + self.alive_tablet_servers_by_id = alive_tablet_servers_by_id; + self.alive_tablet_servers = alive_tablet_servers; + self.available_locations_by_path = available_locations_by_path; + self.available_locations_by_bucket = available_locations_by_bucket; + self.table_id_by_path = 
table_id_by_path; + self.table_path_by_id = table_path_by_id; + self.table_info_by_path = table_info_by_path; + } + + pub fn from_metadata_response( + metadata_response: MetadataResponse, + origin_cluster: Option<&Cluster>, + ) -> Result { + let mut servers = HashMap::with_capacity(metadata_response.tablet_servers.len()); + for pb_server in metadata_response.tablet_servers { + let server_id = pb_server.node_id; + let server_node = from_pb_server_node(pb_server, ServerType::TabletServer); + servers.insert(server_id, server_node); + } + + let coordinator_server = metadata_response + .coordinator_server + .map(|node| from_pb_server_node(node, ServerType::CoordinatorServer)); + + let mut table_id_by_path = HashMap::new(); + let mut table_info_by_path = HashMap::new(); + if let Some(origin) = origin_cluster { + table_info_by_path.extend(origin.get_table_info_by_path().clone()); + table_id_by_path.extend(origin.get_table_id_by_path().clone()); + } + + // Index the bucket locations by table path, and index bucket location by bucket + let mut tmp_available_location_by_bucket = HashMap::new(); + let mut tmp_available_locations_by_path = HashMap::new(); + + for table_metadata in metadata_response.table_metadata { + let table_id = table_metadata.table_id; + let table_path = from_pb_table_path(&table_metadata.table_path); + let table_descriptor = TableDescriptor::deserialize_json( + &serde_json::from_slice(table_metadata.table_json.as_slice()).unwrap(), + )?; + let table_info = TableInfo::of( + table_path.clone(), + table_id, + table_metadata.schema_id, + table_descriptor, + table_metadata.created_time, + table_metadata.modified_time, + ); + table_info_by_path.insert(table_path.clone(), table_info); + + // now, get bucket matadata + let mut found_unavailable_bucket = false; + let mut available_bucket_for_table = vec![]; + let mut bucket_for_table = vec![]; + for bucket_metadata in table_metadata.bucket_metadata { + let bucket_id = bucket_metadata.bucket_id; + let bucket = 
TableBucket::new(table_id, bucket_id); + let bucket_location; + if let Some(leader_id) = bucket_metadata.leader_id + && let Some(server_node) = servers.get(&leader_id) + { + bucket_location = BucketLocation::new( + bucket.clone(), + Some(server_node.clone()), + table_path.clone(), + ); + available_bucket_for_table.push(bucket_location.clone()); + tmp_available_location_by_bucket + .insert(bucket.clone(), bucket_location.clone()); + } else { + found_unavailable_bucket = true; + bucket_location = BucketLocation::new(bucket.clone(), None, table_path.clone()); + } + bucket_for_table.push(bucket_location.clone()); + } + + if found_unavailable_bucket { + tmp_available_locations_by_path + .insert(table_path.clone(), available_bucket_for_table.clone()); + } else { + tmp_available_locations_by_path.insert(table_path.clone(), bucket_for_table); + } + } + Ok(Cluster::new( + coordinator_server, + servers, + tmp_available_locations_by_path, + tmp_available_location_by_bucket, + table_id_by_path, + table_info_by_path, + )) + } + + pub fn get_coordinator_server(&self) -> Option<&ServerNode> { + self.coordinator_server.as_ref() + } + + pub fn leader_for(&self, table_bucket: &TableBucket) -> Option<&ServerNode> { + let location = self.available_locations_by_bucket.get(table_bucket); + if let Some(location) = location { + location.leader().as_ref() + } else { + None + } + } + + pub fn get_tablet_server(&self, id: i32) -> Option<&ServerNode> { + self.alive_tablet_servers_by_id.get(&id) + } + + pub fn get_table_bucket(&self, table_path: &TablePath, bucket_id: BucketId) -> TableBucket { + let table_info = self.get_table(table_path); + TableBucket::new(table_info.table_id, bucket_id) + } + + pub fn get_bucket_locations_by_path(&self) -> &HashMap> { + &self.available_locations_by_path + } + + pub fn get_table_info_by_path(&self) -> &HashMap { + &self.table_info_by_path + } + + pub fn get_table_id_by_path(&self) -> &HashMap { + &self.table_id_by_path + } + + pub fn 
get_available_buckets_for_table_path( + &self, + table_path: &TablePath, + ) -> &Vec { + self.available_locations_by_path + .get(table_path) + .unwrap_or(&EMPTY) + } + + pub fn get_one_available_server(&self) -> &ServerNode { + assert!( + !self.alive_tablet_servers.is_empty(), + "no alive tablet server in cluster" + ); + let offset = random_range(0..self.alive_tablet_servers.len()); + self.alive_tablet_servers + .get(offset) + .unwrap_or_else(|| panic!("can't find alive tab server by offset {offset}")) + } + + pub fn get_bucket_count(&self, table_path: &TablePath) -> i32 { + self.table_info_by_path + .get(table_path) + .unwrap_or_else(|| panic!("can't not table info by path {table_path}")) + .num_buckets + } + + pub fn get_table(&self, table_path: &TablePath) -> &TableInfo { + self.table_info_by_path + .get(table_path) + .unwrap_or_else(|| panic!("can't find table info by path {table_path}")) + } + + pub fn opt_get_table(&self, table_path: &TablePath) -> Option<&TableInfo> { + self.table_info_by_path.get(table_path) + } +} diff --git a/fluss-rust/crates/fluss/src/cluster/mod.rs b/fluss-rust/crates/fluss/src/cluster/mod.rs new file mode 100644 index 0000000000..f9d42e4453 --- /dev/null +++ b/fluss-rust/crates/fluss/src/cluster/mod.rs @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::BucketId; +use crate::metadata::{TableBucket, TablePath}; + +#[allow(clippy::module_inception)] +mod cluster; + +pub use cluster::Cluster; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ServerNode { + id: i32, + uid: String, + host: String, + port: u32, + server_type: ServerType, +} + +impl ServerNode { + pub fn new(id: i32, host: String, port: u32, server_type: ServerType) -> ServerNode { + ServerNode { + id, + uid: match server_type { + ServerType::CoordinatorServer => format!("cs-{id}"), + ServerType::TabletServer => format!("ts-{id}"), + }, + host, + port, + server_type, + } + } + + pub fn uid(&self) -> &String { + &self.uid + } + + pub fn url(&self) -> String { + format!("{}:{}", self.host, self.port) + } + + pub fn id(&self) -> i32 { + self.id + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum ServerType { + TabletServer, + CoordinatorServer, +} + +#[derive(Debug, Clone)] +pub struct BucketLocation { + pub table_bucket: TableBucket, + leader: Option, + pub table_path: TablePath, +} + +impl BucketLocation { + pub fn new( + table_bucket: TableBucket, + leader: Option, + table_path: TablePath, + ) -> BucketLocation { + BucketLocation { + table_bucket, + leader, + table_path, + } + } + + pub fn leader(&self) -> &Option { + &self.leader + } + + pub fn table_bucket(&self) -> &TableBucket { + &self.table_bucket + } + + pub fn bucket_id(&self) -> BucketId { + self.table_bucket.bucket_id() + } +} diff --git a/fluss-rust/crates/fluss/src/config.rs b/fluss-rust/crates/fluss/src/config.rs new file mode 100644 index 0000000000..08574965f4 --- /dev/null +++ b/fluss-rust/crates/fluss/src/config.rs @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use clap::Parser; +use serde::{Deserialize, Serialize}; + +#[derive(Parser, Debug, Clone, Deserialize, Serialize, Default)] +#[command(author, version, about, long_about = None)] +pub struct Config { + #[arg(long)] + #[serde(skip_serializing_if = "Option::is_none")] + pub bootstrap_server: Option, + + #[arg(long, default_value_t = 10 * 1024 * 1024)] + pub request_max_size: i32, + + #[arg(long, default_value_t = String::from("all"))] + pub writer_acks: String, + + #[arg(long, default_value_t = i32::MAX)] + pub writer_retries: i32, + + #[arg(long, default_value_t = 2 * 1024 * 1024)] + pub writer_batch_size: i32, +} diff --git a/fluss-rust/crates/fluss/src/error.rs b/fluss-rust/crates/fluss/src/error.rs new file mode 100644 index 0000000000..58b88a4786 --- /dev/null +++ b/fluss-rust/crates/fluss/src/error.rs @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::rpc::RpcError; +use arrow_schema::ArrowError; +use std::{io, result}; +use thiserror::Error; + +pub type Result = result::Result; + +#[derive(Debug, Error)] +pub enum Error { + #[error(transparent)] + Io(#[from] io::Error), + + #[error("Invalid table")] + InvalidTableError(String), + + #[error("Json serde error")] + JsonSerdeError(String), + + #[error("Rpc error")] + RpcError(#[from] RpcError), + + #[error("Row convert error")] + RowConvertError(String), + + #[error("arrow error")] + ArrowError(#[from] ArrowError), + + #[error("Write error: {0}")] + WriteError(String), + + #[error("Illegal argument error: {0}")] + IllegalArgument(String), +} diff --git a/fluss-rust/crates/fluss/src/lib.rs b/fluss-rust/crates/fluss/src/lib.rs new file mode 100644 index 0000000000..e63b5edfd8 --- /dev/null +++ b/fluss-rust/crates/fluss/src/lib.rs @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +pub mod client; +pub mod metadata; +pub mod record; +pub mod row; +pub mod rpc; + +mod cluster; + +pub mod config; +pub mod error; + +mod util; + +pub type TableId = u64; +pub type PartitionId = u64; +pub type BucketId = i32; + +pub mod proto { + include!(concat!(env!("OUT_DIR"), "/proto.rs")); +} diff --git a/fluss-rust/crates/fluss/src/metadata/datatype.rs b/fluss-rust/crates/fluss/src/metadata/datatype.rs new file mode 100644 index 0000000000..0c00c6f08e --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/datatype.rs @@ -0,0 +1,814 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use serde::{Deserialize, Serialize}; +use std::fmt::{Display, Formatter}; + +/// Data type for Fluss table. 
+/// Impl reference: +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum DataType { + Boolean(BooleanType), + TinyInt(TinyIntType), + SmallInt(SmallIntType), + Int(IntType), + BigInt(BigIntType), + Float(FloatType), + Double(DoubleType), + Char(CharType), + String(StringType), + Decimal(DecimalType), + Date(DateType), + Time(TimeType), + Timestamp(TimestampType), + TimestampLTz(TimestampLTzType), + Bytes(BytesType), + Binary(BinaryType), + Array(ArrayType), + Map(MapType), + Row(RowType), +} + +impl DataType { + pub fn is_nullable(&self) -> bool { + match self { + DataType::Boolean(v) => v.nullable, + DataType::TinyInt(v) => v.nullable, + DataType::SmallInt(v) => v.nullable, + DataType::Int(v) => v.nullable, + DataType::BigInt(v) => v.nullable, + DataType::Decimal(v) => v.nullable, + DataType::Double(v) => v.nullable, + DataType::Float(v) => v.nullable, + DataType::Binary(v) => v.nullable, + DataType::Char(v) => v.nullable, + DataType::String(v) => v.nullable, + DataType::Date(v) => v.nullable, + DataType::TimestampLTz(v) => v.nullable, + DataType::Time(v) => v.nullable, + DataType::Timestamp(v) => v.nullable, + DataType::Array(v) => v.nullable, + DataType::Map(v) => v.nullable, + DataType::Row(v) => v.nullable, + DataType::Bytes(v) => v.nullable, + } + } + + pub fn as_non_nullable(&self) -> Self { + match self { + DataType::Boolean(v) => DataType::Boolean(v.as_non_nullable()), + DataType::TinyInt(v) => DataType::TinyInt(v.as_non_nullable()), + DataType::SmallInt(v) => DataType::SmallInt(v.as_non_nullable()), + DataType::Int(v) => DataType::Int(v.as_non_nullable()), + DataType::BigInt(v) => DataType::BigInt(v.as_non_nullable()), + DataType::Decimal(v) => DataType::Decimal(v.as_non_nullable()), + DataType::Double(v) => DataType::Double(v.as_non_nullable()), + DataType::Float(v) => DataType::Float(v.as_non_nullable()), + DataType::Binary(v) => DataType::Binary(v.as_non_nullable()), + DataType::Char(v) => 
DataType::Char(v.as_non_nullable()), + DataType::String(v) => DataType::String(v.as_non_nullable()), + DataType::Date(v) => DataType::Date(v.as_non_nullable()), + DataType::TimestampLTz(v) => DataType::TimestampLTz(v.as_non_nullable()), + DataType::Time(v) => DataType::Time(v.as_non_nullable()), + DataType::Timestamp(v) => DataType::Timestamp(v.as_non_nullable()), + DataType::Array(v) => DataType::Array(v.as_non_nullable()), + DataType::Map(v) => DataType::Map(v.as_non_nullable()), + DataType::Row(v) => DataType::Row(v.as_non_nullable()), + DataType::Bytes(v) => DataType::Bytes(v.as_non_nullable()), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct BooleanType { + nullable: bool, +} + +impl Default for BooleanType { + fn default() -> Self { + Self::new() + } +} + +impl BooleanType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct TinyIntType { + nullable: bool, +} + +impl Default for TinyIntType { + fn default() -> Self { + Self::new() + } +} + +impl TinyIntType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct SmallIntType { + nullable: bool, +} + +impl Default for SmallIntType { + fn default() -> Self { + Self::new() + } +} + +impl SmallIntType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +#[derive(Debug, Clone, 
PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct IntType { + nullable: bool, +} + +impl Default for IntType { + fn default() -> Self { + Self::new() + } +} + +impl IntType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct BigIntType { + nullable: bool, +} + +impl Default for BigIntType { + fn default() -> Self { + Self::new() + } +} + +impl BigIntType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct FloatType { + nullable: bool, +} + +impl Default for FloatType { + fn default() -> Self { + Self::new() + } +} + +impl FloatType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct DoubleType { + nullable: bool, +} + +impl Default for DoubleType { + fn default() -> Self { + Self::new() + } +} + +impl DoubleType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct CharType { + nullable: bool, + length: u32, +} + +impl CharType { + pub fn new(length: u32) -> Self { + Self::with_nullable(length, true) + } + + 
pub fn with_nullable(length: u32, nullable: bool) -> Self { + Self { nullable, length } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(self.length, false) + } + + pub fn length(&self) -> u32 { + self.length + } +} + +impl Display for CharType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "CHAR({})", self.length)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct StringType { + nullable: bool, +} + +impl Default for StringType { + fn default() -> Self { + Self::new() + } +} + +impl StringType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct DecimalType { + nullable: bool, + precision: u32, + scale: u32, +} + +impl DecimalType { + pub const MIN_PRECISION: u32 = 1; + + pub const MAX_PRECISION: u32 = 38; + + pub const DEFAULT_PRECISION: u32 = 10; + + pub const MIN_SCALE: u32 = 0; + + pub const DEFAULT_SCALE: u32 = 0; + + pub fn new(precision: u32, scale: u32) -> Self { + Self::with_nullable(true, precision, scale) + } + + pub fn with_nullable(nullable: bool, precision: u32, scale: u32) -> Self { + DecimalType { + nullable, + precision, + scale, + } + } + + pub fn precision(&self) -> u32 { + self.precision + } + + pub fn scale(&self) -> u32 { + self.scale + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false, self.precision, self.scale) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct DateType { + nullable: bool, +} + +impl Default for DateType { + fn default() -> Self { + Self::new() + } +} + +impl DateType { + pub fn new() -> Self { + 
Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +#[derive(Default, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct TimeType { + nullable: bool, + precision: u32, +} + +impl TimeType { + fn default() -> Self { + Self::new(Self::DEFAULT_PRECISION) + } +} + +impl TimeType { + pub const MIN_PRECISION: u32 = 0; + + pub const MAX_PRECISION: u32 = 9; + + pub const DEFAULT_PRECISION: u32 = 0; + + pub fn new(precision: u32) -> Self { + Self::with_nullable(true, precision) + } + + pub fn with_nullable(nullable: bool, precision: u32) -> Self { + TimeType { + nullable, + precision, + } + } + + pub fn precision(&self) -> u32 { + self.precision + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false, self.precision) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct TimestampType { + nullable: bool, + precision: u32, +} + +impl Default for TimestampType { + fn default() -> Self { + Self::new(Self::DEFAULT_PRECISION) + } +} + +impl TimestampType { + pub const MIN_PRECISION: u32 = 0; + + pub const MAX_PRECISION: u32 = 9; + + pub const DEFAULT_PRECISION: u32 = 6; + + pub fn new(precision: u32) -> Self { + Self::with_nullable(true, precision) + } + + pub fn with_nullable(nullable: bool, precision: u32) -> Self { + TimestampType { + nullable, + precision, + } + } + + pub fn precision(&self) -> u32 { + self.precision + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false, self.precision) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct TimestampLTzType { + nullable: bool, + precision: u32, +} + +impl Default for TimestampLTzType { + fn default() -> Self { + Self::new(Self::DEFAULT_PRECISION) + } +} + +impl TimestampLTzType { + pub const MIN_PRECISION: 
u32 = 0; + + pub const MAX_PRECISION: u32 = 9; + + pub const DEFAULT_PRECISION: u32 = 6; + + pub fn new(precision: u32) -> Self { + Self::with_nullable(true, precision) + } + + pub fn with_nullable(nullable: bool, precision: u32) -> Self { + TimestampLTzType { + nullable, + precision, + } + } + + pub fn precision(&self) -> u32 { + self.precision + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false, self.precision) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct BytesType { + nullable: bool, +} + +impl Default for BytesType { + fn default() -> Self { + Self::new() + } +} + +impl BytesType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct BinaryType { + nullable: bool, + length: usize, +} + +impl BinaryType { + pub const MIN_LENGTH: usize = 1; + + pub const MAX_LENGTH: usize = usize::MAX; + + pub const DEFAULT_LENGTH: usize = 1; + + pub fn new(length: usize) -> Self { + Self::with_nullable(true, length) + } + + pub fn with_nullable(nullable: bool, length: usize) -> Self { + Self { nullable, length } + } + + pub fn length(&self) -> usize { + self.length + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false, self.length) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct ArrayType { + nullable: bool, + element_type: Box, +} + +impl ArrayType { + pub fn new(element_type: DataType) -> Self { + Self::with_nullable(true, element_type) + } + + pub fn with_nullable(nullable: bool, element_type: DataType) -> Self { + Self { + nullable, + element_type: Box::new(element_type), + } + } + + pub fn as_non_nullable(&self) -> Self { + Self { + nullable: false, + element_type: 
self.element_type.clone(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] +pub struct MapType { + nullable: bool, + key_type: Box, + value_type: Box, +} + +impl MapType { + pub fn new(key_type: DataType, value_type: DataType) -> Self { + Self::with_nullable(true, key_type, value_type) + } + + pub fn with_nullable(nullable: bool, key_type: DataType, value_type: DataType) -> Self { + Self { + nullable, + key_type: Box::new(key_type), + value_type: Box::new(value_type), + } + } + + pub fn as_non_nullable(&self) -> Self { + Self { + nullable: false, + key_type: self.key_type.clone(), + value_type: self.value_type.clone(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] +pub struct RowType { + nullable: bool, + fields: Vec, +} + +impl RowType { + pub const fn new(fields: Vec) -> Self { + Self::with_nullable(true, fields) + } + + pub const fn with_nullable(nullable: bool, fields: Vec) -> Self { + Self { nullable, fields } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false, self.fields.clone()) + } + + pub fn fields(&self) -> &Vec { + &self.fields + } +} + +pub struct DataTypes; + +impl DataTypes { + pub fn binary(length: usize) -> DataType { + DataType::Binary(BinaryType::new(length)) + } + + pub fn bytes() -> DataType { + DataType::Bytes(BytesType::new()) + } + + pub fn boolean() -> DataType { + DataType::Boolean(BooleanType::new()) + } + + pub fn int() -> DataType { + DataType::Int(IntType::new()) + } + + /// Data type of a 1-byte signed integer with values from -128 to 127. + pub fn tinyint() -> DataType { + DataType::TinyInt(TinyIntType::new()) + } + + /// Data type of a 2-byte signed integer with values from -32,768 to 32,767. + pub fn smallint() -> DataType { + DataType::SmallInt(SmallIntType::new()) + } + + pub fn bigint() -> DataType { + DataType::BigInt(BigIntType::new()) + } + + /// Data type of a 4-byte single precision floating point number. 
+ pub fn float() -> DataType { + DataType::Float(FloatType::new()) + } + + /// Data type of an 8-byte double precision floating point number. + pub fn double() -> DataType { + DataType::Double(DoubleType::new()) + } + + pub fn char(length: u32) -> DataType { + DataType::Char(CharType::new(length)) + } + + /// Data type of a variable-length character string. + pub fn string() -> DataType { + DataType::String(StringType::new()) + } + + /// Data type of a decimal number with fixed precision and scale `DECIMAL(p, s)` where + /// `p` is the number of digits in a number (=precision) and `s` is the number of + /// digits to the right of the decimal point in a number (=scale). `p` must have a value + /// between 1 and 38 (both inclusive). `s` must have a value between 0 and `p` (both inclusive). + pub fn decimal(precision: u32, scale: u32) -> DataType { + DataType::Decimal(DecimalType::new(precision, scale)) + } + + pub fn date() -> DataType { + DataType::Date(DateType::new()) + } + + /// Data type of a time WITHOUT time zone `TIME` with no fractional seconds by default. + pub fn time() -> DataType { + DataType::Time(TimeType::default()) + } + + /// Data type of a time WITHOUT time zone `TIME(p)` where `p` is the number of digits + /// of fractional seconds (=precision). `p` must have a value between 0 and 9 (both inclusive). + pub fn time_with_precision(precision: u32) -> DataType { + DataType::Time(TimeType::new(precision)) + } + + /// Data type of a timestamp WITHOUT time zone `TIMESTAMP` with 6 digits of fractional + /// seconds by default. + pub fn timestamp() -> DataType { + DataType::Timestamp(TimestampType::default()) + } + + /// Data type of a timestamp WITHOUT time zone `TIMESTAMP(p)` where `p` is the number + /// of digits of fractional seconds (=precision). `p` must have a value between 0 and 9 + /// (both inclusive). 
+ pub fn timestamp_with_precision(precision: u32) -> DataType { + DataType::Timestamp(TimestampType::new(precision)) + } + + /// Data type of a timestamp WITH time zone `TIMESTAMP WITH TIME ZONE` with 6 digits of + /// fractional seconds by default. + pub fn timestamp_ltz() -> DataType { + DataType::TimestampLTz(TimestampLTzType::default()) + } + + /// Data type of a timestamp WITH time zone `TIMESTAMP WITH TIME ZONE(p)` where `p` is the number + /// of digits of fractional seconds (=precision). `p` must have a value between 0 and 9 (both inclusive). + pub fn timestamp_ltz_with_precision(precision: u32) -> DataType { + DataType::TimestampLTz(TimestampLTzType::new(precision)) + } + + /// Data type of an array of elements with same subtype. + pub fn array(element: DataType) -> DataType { + DataType::Array(ArrayType::new(element)) + } + + /// Data type of an associative array that maps keys to values. + pub fn map(key_type: DataType, value_type: DataType) -> DataType { + DataType::Map(MapType::new(key_type, value_type)) + } + + /// Field definition with field name and data type. + pub fn field(name: String, data_type: DataType) -> DataField { + DataField::new(name, data_type, None) + } + + /// Field definition with field name, data type, and a description. + pub fn field_with_description( + name: String, + data_type: DataType, + description: String, + ) -> DataField { + DataField::new(name, data_type, Some(description)) + } + + /// Data type of a sequence of fields. + pub fn row(fields: Vec) -> DataType { + DataType::Row(RowType::new(fields)) + } + + /// Data type of a sequence of fields with generated field names (f0, f1, f2, ...). 
+ pub fn row_from_types(field_types: Vec) -> DataType { + let fields = field_types + .into_iter() + .enumerate() + .map(|(i, dt)| DataField::new(format!("f{i}"), dt, None)) + .collect(); + DataType::Row(RowType::new(fields)) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct DataField { + pub name: String, + pub data_type: DataType, + pub description: Option, +} + +impl DataField { + pub fn new(name: String, data_type: DataType, description: Option) -> DataField { + DataField { + name, + data_type, + description, + } + } + + pub fn name(&self) -> &str { + &self.name + } + + pub fn data_type(&self) -> &DataType { + &self.data_type + } +} + +// todo: implement display for datatype diff --git a/fluss-rust/crates/fluss/src/metadata/json_serde.rs b/fluss-rust/crates/fluss/src/metadata/json_serde.rs new file mode 100644 index 0000000000..1c7604c98a --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/json_serde.rs @@ -0,0 +1,464 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::error::Error::{InvalidTableError, JsonSerdeError}; +use crate::error::Result; +use crate::metadata::datatype::{DataType, DataTypes}; +use crate::metadata::table::{Column, Schema, TableDescriptor}; +use serde_json::{Value, json}; +use std::collections::HashMap; + +pub trait JsonSerde: Sized { + fn serialize_json(&self) -> Result; + + fn deserialize_json(node: &Value) -> Result; +} + +impl DataType { + pub fn to_type_root(&self) -> &str { + match &self { + DataType::Boolean(_) => "BOOLEAN", + DataType::TinyInt(_) => "TINYINT", + DataType::SmallInt(_) => "SMALLINT", + DataType::Int(_) => "INTEGER", + DataType::BigInt(_) => "BIGINT", + DataType::Float(_) => "FLOAT", + DataType::Double(_) => "DOUBLE", + DataType::Char(_) => "CHAR", + DataType::String(_) => "STRING", + DataType::Decimal(_) => "DECIMAL", + DataType::Date(_) => "DATE", + DataType::Time(_) => "TIME_WITHOUT_TIME_ZONE", + DataType::Timestamp(_) => "TIMESTAMP_WITHOUT_TIME_ZONE", + DataType::TimestampLTz(_) => "TIMESTAMP_WITH_LOCAL_TIME_ZONE", + DataType::Bytes(_) => "BYTES", + DataType::Binary(_) => "BINARY", + DataType::Array(_) => "ARRAY", + DataType::Map(_) => "MAP", + DataType::Row(_) => "ROW", + } + } +} + +impl DataType { + const FIELD_NAME_TYPE_NAME: &'static str = "type"; + const FIELD_NAME_NULLABLE: &'static str = "nullable"; + const FIELD_NAME_LENGTH: &'static str = "length"; + #[allow(dead_code)] + const FIELD_NAME_PRECISION: &'static str = "precision"; + #[allow(dead_code)] + const FILED_NAME_SCALE: &'static str = "scale"; + #[allow(dead_code)] + const FIELD_NAME_ELEMENT_TYPE: &'static str = "element_type"; + #[allow(dead_code)] + const FIELD_NAME_KEY_TYPE: &'static str = "key_type"; + #[allow(dead_code)] + const FIELD_NAME_VALUE_TYPE: &'static str = "value_type"; + #[allow(dead_code)] + const FIELD_NAME_FIELDS: &'static str = "fields"; + #[allow(dead_code)] + const FIELD_NAME_FIELD_NAME: &'static str = "name"; + // ROW + #[allow(dead_code)] + const FIELD_NAME_FIELD_TYPE: &'static str 
= "field_type"; + #[allow(dead_code)] + const FIELD_NAME_FIELD_DESCRIPTION: &'static str = "description"; +} + +impl JsonSerde for DataType { + fn serialize_json(&self) -> Result { + let mut obj = serde_json::Map::new(); + + obj.insert( + Self::FIELD_NAME_TYPE_NAME.to_string(), + json!(Self::to_type_root(self)), + ); + if !self.is_nullable() { + obj.insert(Self::FIELD_NAME_NULLABLE.to_string(), json!(false)); + } + + match &self { + DataType::Boolean(_) + | DataType::TinyInt(_) + | DataType::SmallInt(_) + | DataType::Int(_) + | DataType::BigInt(_) + | DataType::Float(_) + | DataType::Double(_) + | DataType::String(_) + | DataType::Bytes(_) + | DataType::Date(_) => { + // do nothing + } + DataType::Char(_type) => { + obj.insert(Self::FIELD_NAME_LENGTH.to_string(), json!(_type.length())); + } + DataType::Binary(_type) => { + obj.insert(Self::FIELD_NAME_LENGTH.to_string(), json!(_type.length())); + } + DataType::Decimal(_type) => { + todo!() + } + + DataType::Time(_type) => { + todo!() + } + DataType::Timestamp(_type) => { + todo!() + } + DataType::TimestampLTz(_type) => { + todo!() + } + DataType::Array(_type) => todo!(), + DataType::Map(_type) => todo!(), + DataType::Row(_type) => todo!(), + } + Ok(Value::Object(obj)) + } + + fn deserialize_json(node: &Value) -> Result { + let mut _is_nullable = true; + let type_root = node + .get(Self::FIELD_NAME_TYPE_NAME) + .and_then(|v| v.as_str()) + .ok_or_else(|| { + JsonSerdeError(format!( + "Couldn't find field {} while deserializing datatype.", + Self::FIELD_NAME_TYPE_NAME + )) + })?; + + let mut data_type = match type_root { + "BOOLEAN" => DataTypes::boolean(), + "TINYINT" => DataTypes::tinyint(), + "SMALLINT" => DataTypes::smallint(), + "INTEGER" => DataTypes::int(), + "BIGINT" => DataTypes::bigint(), + "FLOAT" => DataTypes::float(), + "DOUBLE" => DataTypes::double(), + "CHAR" => todo!(), + "STRING" => DataTypes::string(), + "DECIMAL" => todo!(), + "DATE" => DataTypes::date(), + "TIME_WITHOUT_TIME_ZONE" => todo!(), // 
Precision set separately + "TIMESTAMP_WITHOUT_TIME_ZONE" => todo!(), // Precision set separately + "TIMESTAMP_WITH_LOCAL_TIME_ZONE" => todo!(), // Precision set separately + "BYTES" => DataTypes::bytes(), + "BINARY" => todo!(), + "ARRAY" => todo!(), + "MAP" => todo!(), + "ROW" => todo!(), + _ => return Err(JsonSerdeError(format!("Unknown type root: {type_root}"))), + }; + + if let Some(nullable) = node.get(Self::FIELD_NAME_NULLABLE) { + let nullable_value = nullable.as_bool().unwrap_or(true); + if !nullable_value { + data_type = data_type.as_non_nullable(); + } + } + Ok(data_type) + } +} + +impl Column { + const NAME: &'static str = "name"; + const DATA_TYPE: &'static str = "data_type"; + const COMMENT: &'static str = "comment"; +} + +impl JsonSerde for Column { + fn serialize_json(&self) -> Result { + let mut obj = serde_json::Map::new(); + + // Common fields + obj.insert(Self::NAME.to_string(), json!(self.name())); + obj.insert( + Self::DATA_TYPE.to_string(), + self.data_type().serialize_json()?, + ); + + if let Some(comment) = &self.comment() { + obj.insert(Self::COMMENT.to_string(), json!(comment)); + } + + Ok(Value::Object(obj)) + } + + fn deserialize_json(node: &Value) -> Result { + let name = node + .get(Self::NAME) + .and_then(|v| v.as_str()) + .unwrap_or_else(|| panic!("{}", format!("Missing required field: {}", Self::NAME))) + .to_string(); + + let data_type_node = node.get(Self::DATA_TYPE).ok_or_else(|| { + JsonSerdeError(format!("Missing required field: {}", Self::DATA_TYPE)) + })?; + + let data_type = DataType::deserialize_json(data_type_node)?; + + let mut column = Column::new(&name, data_type); + + if let Some(comment) = node.get(Self::COMMENT).and_then(|v| v.as_str()) { + column = column.with_comment(comment); + } + + Ok(column) + } +} + +impl Schema { + const COLUMNS_NAME: &'static str = "columns"; + const PRIMARY_KEY_NAME: &'static str = "primary_key"; + const VERSION_KEY: &'static str = "version"; + const VERSION: u32 = 1; +} + +impl JsonSerde 
for Schema { + fn serialize_json(&self) -> Result { + let mut obj = serde_json::Map::new(); + + // Serialize version + obj.insert(Self::VERSION_KEY.to_string(), json!(Self::VERSION)); + + // Serialize columns + let columns: Vec = self + .columns() + .iter() + .map(|col| col.serialize_json()) + .collect::>()?; + obj.insert(Self::COLUMNS_NAME.to_string(), json!(columns)); + + // Serialize primary key if present + if let Some(primary_key) = &self.primary_key() { + let pk_values: Vec = primary_key + .column_names() + .iter() + .map(|name| json!(name)) + .collect(); + obj.insert(Self::PRIMARY_KEY_NAME.to_string(), json!(pk_values)); + } + Ok(Value::Object(obj)) + } + + fn deserialize_json(node: &Value) -> Result { + let columns_node = node + .get(Self::COLUMNS_NAME) + .ok_or_else(|| { + JsonSerdeError(format!("Missing required field: {}", Self::COLUMNS_NAME)) + })? + .as_array() + .ok_or_else(|| JsonSerdeError(format!("{} should be an array", Self::COLUMNS_NAME)))?; + + let mut columns = Vec::with_capacity(columns_node.len()); + for col_node in columns_node { + columns.push(Column::deserialize_json(col_node)?); + } + + let mut schema_builder = Schema::builder().with_columns(columns); + + if let Some(pk_node) = node.get(Self::PRIMARY_KEY_NAME) { + let pk_array = pk_node + .as_array() + .ok_or_else(|| InvalidTableError("Primary key is not an array".to_string()))?; + + let mut primary_keys = Vec::with_capacity(pk_array.len()); + for name_node in pk_array { + primary_keys.push( + name_node + .as_str() + .ok_or_else(|| InvalidTableError("Primary key is not string".to_string()))? 
+ .to_string(), + ); + } + + schema_builder = schema_builder.primary_key(primary_keys); + } + + schema_builder.build() + } +} + +impl TableDescriptor { + const SCHEMA_NAME: &'static str = "schema"; + const COMMENT_NAME: &'static str = "comment"; + const PARTITION_KEY_NAME: &'static str = "partition_key"; + const BUCKET_KEY_NAME: &'static str = "bucket_key"; + const BUCKET_COUNT_NAME: &'static str = "bucket_count"; + const PROPERTIES_NAME: &'static str = "properties"; + const CUSTOM_PROPERTIES_NAME: &'static str = "custom_properties"; + const VERSION_KEY: &'static str = "version"; + const VERSION: u32 = 1; + + fn deserialize_properties(node: &Value) -> Result> { + let obj = node + .as_object() + .ok_or_else(|| JsonSerdeError("Properties should be an object".to_string()))?; + + let mut properties = HashMap::with_capacity(obj.len()); + for (key, value) in obj { + properties.insert( + key.clone(), + value + .as_str() + .ok_or_else(|| JsonSerdeError("Properties should be an object".to_string()))? 
+ .to_owned(), + ); + } + + Ok(properties) + } +} + +impl JsonSerde for TableDescriptor { + fn serialize_json(&self) -> Result { + let mut obj = serde_json::Map::new(); + + // Serialize version + obj.insert(Self::VERSION_KEY.to_string(), json!(Self::VERSION)); + + // Serialize schema + obj.insert( + Self::SCHEMA_NAME.to_string(), + self.schema().serialize_json()?, + ); + + // Serialize comment if present + if let Some(comment) = &self.comment() { + obj.insert(Self::COMMENT_NAME.to_string(), json!(comment)); + } + + // Serialize partition keys + let partition_keys: Vec = + self.partition_keys().iter().map(|key| json!(key)).collect(); + obj.insert(Self::PARTITION_KEY_NAME.to_string(), json!(partition_keys)); + + // Serialize table distribution if present + if let Some(dist) = &self.table_distribution() { + let bucket_keys: Vec = dist.bucket_keys().iter().map(|key| json!(key)).collect(); + obj.insert(Self::BUCKET_KEY_NAME.to_string(), json!(bucket_keys)); + + if let Some(count) = dist.bucket_count() { + obj.insert(Self::BUCKET_COUNT_NAME.to_string(), json!(count)); + } + } + + // Serialize properties + obj.insert(Self::PROPERTIES_NAME.to_string(), json!(self.properties())); + + obj.insert( + Self::CUSTOM_PROPERTIES_NAME.to_string(), + json!(self.custom_properties()), + ); + + Ok(Value::Object(obj)) + } + + fn deserialize_json(node: &Value) -> Result { + let mut builder = TableDescriptor::builder(); + + // Deserialize schema + let schema_node = node.get(Self::SCHEMA_NAME).ok_or_else(|| { + JsonSerdeError(format!("Missing required field: {}", Self::SCHEMA_NAME)) + })?; + let schema = Schema::deserialize_json(schema_node)?; + builder = builder.schema(schema); + + // Deserialize comment if present + if let Some(comment_node) = node.get(Self::COMMENT_NAME) { + let comment = comment_node + .as_str() + .ok_or_else(|| { + JsonSerdeError(format!("{} should be a string", Self::COMMENT_NAME)) + })? 
+ .to_owned(); + builder = builder.comment(comment.as_str()); + } + + let partition_node = node + .get(Self::PARTITION_KEY_NAME) + .ok_or_else(|| { + JsonSerdeError(format!( + "Missing required field: {}", + Self::PARTITION_KEY_NAME + )) + })? + .as_array() + .ok_or_else(|| { + JsonSerdeError(format!("{} should be an array", Self::PARTITION_KEY_NAME)) + })?; + + let mut partition_keys = Vec::with_capacity(partition_node.len()); + for key_node in partition_node { + partition_keys.push( + key_node + .as_str() + .ok_or_else(|| { + JsonSerdeError(format!("{} should be a string", Self::PARTITION_KEY_NAME)) + })? + .to_owned(), + ); + } + builder = builder.partitioned_by(partition_keys); + + let mut bucket_count = None; + let mut bucket_keys = vec![]; + if let Some(bucket_key_node) = node.get(Self::BUCKET_KEY_NAME) { + let bucket_key_node = bucket_key_node.as_array().ok_or_else(|| { + JsonSerdeError(format!("{} should be an array", Self::BUCKET_COUNT_NAME)) + })?; + + for key_node in bucket_key_node { + bucket_keys.push( + key_node + .as_str() + .ok_or_else(|| JsonSerdeError("Bucket key should be a string".to_string()))? 
+ .to_owned(), + ); + } + } + + if let Some(bucket_count_node) = node.get(Self::BUCKET_COUNT_NAME) { + bucket_count = bucket_count_node.as_u64().map(|n| n as i32); + } + + if bucket_count.is_some() || !bucket_keys.is_empty() { + builder = builder.distributed_by(bucket_count, bucket_keys); + } + + // Deserialize properties + let properties = + Self::deserialize_properties(node.get(Self::PROPERTIES_NAME).ok_or_else(|| { + JsonSerdeError(format!("Missing required field: {}", Self::PROPERTIES_NAME)) + })?)?; + builder = builder.properties(properties); + + // Deserialize custom properties + let custom_properties = Self::deserialize_properties( + node.get(Self::CUSTOM_PROPERTIES_NAME).ok_or_else(|| { + JsonSerdeError(format!( + "Missing required field: {}", + Self::CUSTOM_PROPERTIES_NAME + )) + })?, + )?; + builder = builder.custom_properties(custom_properties); + + builder.build() + } +} diff --git a/fluss-rust/crates/fluss/src/metadata/mod.rs b/fluss-rust/crates/fluss/src/metadata/mod.rs new file mode 100644 index 0000000000..79465474f7 --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/mod.rs @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +mod datatype; +pub use datatype::*; +mod json_serde; +mod table; + +pub use json_serde::*; +pub use table::*; diff --git a/fluss-rust/crates/fluss/src/metadata/table.rs b/fluss-rust/crates/fluss/src/metadata/table.rs new file mode 100644 index 0000000000..a5ab61d67e --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/table.rs @@ -0,0 +1,920 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::error::Error::InvalidTableError; +use crate::error::Result; +use crate::metadata::datatype::{DataField, DataType, RowType}; +use core::fmt; +use serde::{Deserialize, Serialize}; +use std::collections::{HashMap, HashSet}; +use std::fmt::{Display, Formatter}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct Column { + name: String, + data_type: DataType, + comment: Option, +} + +impl Column { + pub fn new(name: &str, data_type: DataType) -> Self { + Self { + name: name.to_string(), + data_type, + comment: None, + } + } + + pub fn with_comment(mut self, comment: &str) -> Self { + self.comment = Some(comment.to_string()); + self + } + + pub fn with_data_type(&self, data_type: DataType) -> Self { + Self { + name: self.name.clone(), + data_type: data_type.clone(), + comment: self.comment.clone(), + } + } + + // Getters... + pub fn name(&self) -> &str { + &self.name + } + + pub fn data_type(&self) -> &DataType { + &self.data_type + } + + pub fn comment(&self) -> Option<&str> { + self.comment.as_deref() + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PrimaryKey { + constraint_name: String, + column_names: Vec, +} + +impl PrimaryKey { + pub fn new(constraint_name: &str, column_names: Vec) -> Self { + Self { + constraint_name: constraint_name.to_string(), + column_names, + } + } + + // Getters... 
+ pub fn constraint_name(&self) -> &str { + &self.constraint_name + } + + pub fn column_names(&self) -> &[String] { + &self.column_names + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct Schema { + columns: Vec, + primary_key: Option, + // must be Row data type kind + row_type: DataType, +} + +impl Schema { + pub fn empty() -> Result { + Self::builder().build() + } + + pub fn builder() -> SchemaBuilder { + SchemaBuilder::new() + } + + pub fn columns(&self) -> &[Column] { + &self.columns + } + + pub fn primary_key(&self) -> Option<&PrimaryKey> { + self.primary_key.as_ref() + } + + pub fn row_type(&self) -> &DataType { + &self.row_type + } + + pub fn primary_key_indexes(&self) -> Vec { + self.primary_key + .as_ref() + .map(|pk| { + pk.column_names + .iter() + .filter_map(|name| self.columns.iter().position(|c| &c.name == name)) + .collect() + }) + .unwrap_or_default() + } + + pub fn primary_key_column_names(&self) -> Vec<&str> { + self.primary_key + .as_ref() + .map(|pk| pk.column_names.iter().map(|s| s.as_str()).collect()) + .unwrap_or_default() + } + + pub fn column_names(&self) -> Vec<&str> { + self.columns.iter().map(|c| c.name.as_str()).collect() + } +} + +#[derive(Debug, Default)] +pub struct SchemaBuilder { + columns: Vec, + primary_key: Option, +} + +impl SchemaBuilder { + pub fn new() -> Self { + Self::default() + } + + pub fn with_row_type(mut self, row_type: &DataType) -> Self { + match row_type { + DataType::Row(row) => { + for data_field in row.fields() { + self = self.column(&data_field.name, data_field.data_type.clone()) + } + self + } + _ => { + panic!("data type msut be row type") + } + } + } + + pub fn column(mut self, name: &str, data_type: DataType) -> Self { + self.columns.push(Column::new(name, data_type)); + self + } + + pub fn with_columns(mut self, columns: Vec) -> Self { + self.columns.extend_from_slice(columns.as_ref()); + self + } + + pub fn with_comment(mut self, comment: &str) -> Self { + if let Some(last) 
= self.columns.last_mut() { + *last = last.clone().with_comment(comment); + } + self + } + + pub fn primary_key(self, column_names: Vec) -> Self { + let constraint_name = format!("PK_{}", column_names.join("_")); + self.primary_key_named(&constraint_name, column_names) + } + + pub fn primary_key_named(mut self, constraint_name: &str, column_names: Vec) -> Self { + self.primary_key = Some(PrimaryKey::new(constraint_name, column_names)); + self + } + + pub fn build(&mut self) -> Result { + let columns = Self::normalize_columns(&mut self.columns, self.primary_key.as_ref())?; + + let data_fields = columns + .iter() + .map(|c| DataField { + name: c.name.clone(), + data_type: c.data_type.clone(), + description: c.comment.clone(), + }) + .collect(); + + Ok(Schema { + columns, + primary_key: self.primary_key.clone(), + row_type: DataType::Row(RowType::new(data_fields)), + }) + } + + fn normalize_columns( + columns: &mut [Column], + primary_key: Option<&PrimaryKey>, + ) -> Result> { + let names: Vec<_> = columns.iter().map(|c| &c.name).collect(); + if let Some(duplicates) = Self::find_duplicates(&names) { + return Err(InvalidTableError(format!( + "Duplicate column names found: {duplicates:?}" + ))); + } + + let Some(pk) = primary_key else { + return Ok(columns.to_vec()); + }; + + let pk_set: HashSet<_> = pk.column_names.iter().collect(); + let all_columns: HashSet<_> = columns.iter().map(|c| &c.name).collect(); + if !pk_set.is_subset(&all_columns) { + return Err(InvalidTableError(format!( + "Primary key columns {pk_set:?} not found in schema" + ))); + } + + Ok(columns + .iter() + .map(|col| { + if pk_set.contains(&col.name) && col.data_type.is_nullable() { + col.with_data_type(col.data_type.as_non_nullable()) + } else { + col.clone() + } + }) + .collect()) + } + + fn find_duplicates<'a>(names: &'a [&String]) -> Option> { + let mut seen = HashSet::new(); + let mut duplicates = HashSet::new(); + + for name in names { + if !seen.insert(name) { + duplicates.insert(*name); + } + 
} + + if duplicates.is_empty() { + None + } else { + Some(duplicates) + } + } +} + +/// distribution of table +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct TableDistribution { + bucket_count: Option, + bucket_keys: Vec, +} + +impl TableDistribution { + pub fn bucket_keys(&self) -> &[String] { + &self.bucket_keys + } + + pub fn bucket_count(&self) -> Option { + self.bucket_count + } +} + +#[derive(Debug, Default)] +pub struct TableDescriptorBuilder { + schema: Option, + properties: HashMap, + custom_properties: HashMap, + partition_keys: Vec, + comment: Option, + table_distribution: Option, +} + +impl TableDescriptorBuilder { + pub fn new() -> Self { + Self::default() + } + + pub fn schema(mut self, schema: Schema) -> Self { + self.schema = Some(schema); + self + } + + pub fn log_format(mut self, log_format: LogFormat) -> Self { + self.properties + .insert("table.log.format".to_string(), log_format.to_string()); + self + } + + pub fn kv_format(mut self, kv_format: KvFormat) -> Self { + self.properties + .insert("table.kv.format".to_string(), kv_format.to_string()); + self + } + + pub fn property(mut self, key: &str, value: T) -> Self { + self.properties.insert(key.to_string(), value.to_string()); + self + } + + pub fn properties(mut self, properties: HashMap) -> Self { + self.properties.extend(properties); + self + } + + pub fn custom_property(mut self, key: &str, value: &str) -> Self { + self.custom_properties + .insert(key.to_string(), value.to_string()); + self + } + + pub fn custom_properties(mut self, custom_properties: HashMap) -> Self { + self.custom_properties.extend(custom_properties); + self + } + + pub fn partitioned_by(mut self, partition_keys: Vec) -> Self { + self.partition_keys = partition_keys; + self + } + + pub fn distributed_by(mut self, bucket_count: Option, bucket_keys: Vec) -> Self { + self.table_distribution = Some(TableDistribution { + bucket_count, + bucket_keys, + }); + self + } + + pub fn comment(mut self, 
comment: &str) -> Self { + self.comment = Some(comment.to_string()); + self + } + + pub fn build(self) -> Result { + let schema = self.schema.expect("Schema must be set"); + let table_distribution = TableDescriptor::normalize_distribution( + &schema, + &self.partition_keys, + self.table_distribution, + )?; + Ok(TableDescriptor { + schema, + comment: self.comment, + partition_keys: self.partition_keys, + table_distribution, + properties: self.properties, + custom_properties: self.custom_properties, + }) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct TableDescriptor { + schema: Schema, + comment: Option, + partition_keys: Vec, + table_distribution: Option, + properties: HashMap, + custom_properties: HashMap, +} + +impl TableDescriptor { + pub fn builder() -> TableDescriptorBuilder { + TableDescriptorBuilder::new() + } + + pub fn schema(&self) -> &Schema { + &self.schema + } + + pub fn bucket_keys(&self) -> Vec<&str> { + self.table_distribution + .as_ref() + .map(|td| td.bucket_keys.iter().map(|s| s.as_str()).collect()) + .unwrap_or_default() + } + + pub fn is_default_bucket_key(&self) -> Result { + if self.schema.primary_key().is_some() { + Ok(self.bucket_keys() + == Self::default_bucket_key_of_primary_key_table( + self.schema(), + &self.partition_keys, + )? 
+ .iter() + .map(|s| s.as_str()) + .collect::>()) + } else { + Ok(self.bucket_keys().is_empty()) + } + } + + pub fn is_partitioned(&self) -> bool { + !self.partition_keys.is_empty() + } + + pub fn has_primary_key(&self) -> bool { + self.schema.primary_key().is_some() + } + + pub fn partition_keys(&self) -> &[String] { + &self.partition_keys + } + + pub fn table_distribution(&self) -> Option<&TableDistribution> { + self.table_distribution.as_ref() + } + + pub fn properties(&self) -> &HashMap { + &self.properties + } + + pub fn custom_properties(&self) -> &HashMap { + &self.custom_properties + } + + pub fn replication_factor(&self) -> Result { + self.properties + .get("table.replication.factor") + .ok_or(InvalidTableError( + "Replication factor is not set".to_string(), + ))? + .parse() + .map_err(|_e| { + InvalidTableError("Replication factor can't be convert into int".to_string()) + }) + } + + pub fn with_properties(&self, new_properties: HashMap) -> Self { + Self { + properties: new_properties, + ..self.clone() + } + } + + pub fn with_replication_factor(&self, new_replication_factor: i32) -> Self { + let mut properties = self.properties.clone(); + properties.insert( + "table.replication.factor".to_string(), + new_replication_factor.to_string(), + ); + self.with_properties(properties) + } + + pub fn with_bucket_count(&self, new_bucket_count: i32) -> Self { + Self { + table_distribution: Some(TableDistribution { + bucket_count: Some(new_bucket_count), + bucket_keys: self + .table_distribution + .as_ref() + .map(|td| td.bucket_keys.clone()) + .unwrap_or_default(), + }), + ..self.clone() + } + } + + pub fn comment(&self) -> Option<&str> { + self.comment.as_deref() + } + + fn default_bucket_key_of_primary_key_table( + schema: &Schema, + partition_keys: &[String], + ) -> Result> { + let mut bucket_keys = schema + .primary_key() + .expect("Primary key must be set") + .column_names() + .to_vec(); + + bucket_keys.retain(|k| !partition_keys.contains(k)); + + if 
bucket_keys.is_empty() { + return Err(InvalidTableError(format!( + "Primary Key constraint {:?} should not be same with partition fields {:?}.", + schema.primary_key().unwrap().column_names(), + partition_keys + ))); + } + + Ok(bucket_keys) + } + + fn normalize_distribution( + schema: &Schema, + partition_keys: &[String], + origin_distribution: Option, + ) -> Result> { + if let Some(distribution) = origin_distribution { + if distribution + .bucket_keys + .iter() + .any(|k| partition_keys.contains(k)) + { + return Err(InvalidTableError(format!( + "Bucket key {:?} shouldn't include any column in partition keys {:?}.", + distribution.bucket_keys, partition_keys + ))); + } + + return if let Some(pk) = schema.primary_key() { + if distribution.bucket_keys.is_empty() { + Ok(Some(TableDistribution { + bucket_count: distribution.bucket_count, + bucket_keys: Self::default_bucket_key_of_primary_key_table( + schema, + partition_keys, + )?, + })) + } else { + let pk_columns: HashSet<_> = pk.column_names().iter().collect(); + if !distribution + .bucket_keys + .iter() + .all(|k| pk_columns.contains(k)) + { + return Err(InvalidTableError(format!( + "Bucket keys must be a subset of primary keys excluding partition keys for primary-key tables. 
\ + The primary keys are {:?}, the partition keys are {:?}, but the user-defined bucket keys are {:?}.", + pk.column_names(), + partition_keys, + distribution.bucket_keys + ))); + } + Ok(Some(distribution)) + } + } else { + Ok(Some(distribution)) + }; + } else if schema.primary_key().is_some() { + return Ok(Some(TableDistribution { + bucket_count: None, + bucket_keys: Self::default_bucket_key_of_primary_key_table(schema, partition_keys)?, + })); + } + + Ok(None) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum LogFormat { + ARROW, + INDEXED, +} + +impl Display for LogFormat { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + LogFormat::ARROW => { + write!(f, "ARROW")?; + } + LogFormat::INDEXED => { + write!(f, "INDEXED")?; + } + } + Ok(()) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum KvFormat { + INDEXED, + COMPACTED, +} + +impl Display for KvFormat { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + KvFormat::COMPACTED => write!(f, "COMPACTED")?, + KvFormat::INDEXED => write!(f, "INDEXED")?, + } + Ok(()) + } +} + +#[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize)] +pub struct TablePath { + database: String, + table: String, +} + +impl Display for TablePath { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{}.{}", self.database, self.table) + } +} + +impl TablePath { + pub fn new(db: String, tbl: String) -> Self { + TablePath { + database: db, + table: tbl, + } + } + + #[inline] + pub fn database(&self) -> &str { + &self.database + } + + #[inline] + pub fn table(&self) -> &str { + &self.table + } +} + +#[derive(Debug, Clone)] +pub struct TableInfo { + pub table_path: TablePath, + pub table_id: i64, + pub schema_id: i32, + pub schema: Schema, + pub row_type: DataType, + pub primary_keys: Vec, + pub physical_primary_keys: Vec, + pub bucket_keys: Vec, + pub partition_keys: Vec, + pub 
num_buckets: i32, + pub properties: HashMap, + pub table_config: TableConfig, + pub custom_properties: HashMap, + pub comment: Option, + pub created_time: i64, + pub modified_time: i64, +} + +impl TableInfo { + pub fn row_type(&self) -> &RowType { + match &self.row_type { + DataType::Row(row_type) => row_type, + _ => panic!("should be a row type"), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct TableConfig { + pub properties: HashMap, +} + +impl TableConfig { + pub fn from_properties(properties: HashMap) -> Self { + TableConfig { properties } + } +} + +impl TableInfo { + pub fn of( + table_path: TablePath, + table_id: i64, + schema_id: i32, + table_descriptor: TableDescriptor, + created_time: i64, + modified_time: i64, + ) -> TableInfo { + let TableDescriptor { + schema, + table_distribution, + comment, + partition_keys, + properties, + custom_properties, + } = table_descriptor; + let TableDistribution { + bucket_count, + bucket_keys, + } = table_distribution.unwrap(); + TableInfo::new( + table_path, + table_id, + schema_id, + schema, + bucket_keys, + partition_keys, + bucket_count.unwrap(), + properties, + custom_properties, + comment, + created_time, + modified_time, + ) + } + + #[allow(clippy::too_many_arguments)] + pub fn new( + table_path: TablePath, + table_id: i64, + schema_id: i32, + schema: Schema, + bucket_keys: Vec, + partition_keys: Vec, + num_buckets: i32, + properties: HashMap, + custom_properties: HashMap, + comment: Option, + created_time: i64, + modified_time: i64, + ) -> Self { + let row_type = schema.row_type.clone(); + let primary_keys: Vec = schema + .primary_key_column_names() + .iter() + .map(|col| (*col).to_string()) + .collect(); + let physical_primary_keys = + Self::generate_physical_primary_key(&primary_keys, &partition_keys); + let table_config = TableConfig::from_properties(properties.clone()); + + TableInfo { + table_path, + table_id, + schema_id, + schema, + row_type, + primary_keys, + 
physical_primary_keys, + bucket_keys, + partition_keys, + num_buckets, + properties, + table_config, + custom_properties, + comment, + created_time, + modified_time, + } + } + + pub fn get_table_path(&self) -> &TablePath { + &self.table_path + } + + pub fn get_table_id(&self) -> i64 { + self.table_id + } + + pub fn get_schema_id(&self) -> i32 { + self.schema_id + } + + pub fn get_schema(&self) -> &Schema { + &self.schema + } + + pub fn get_row_type(&self) -> &DataType { + &self.row_type + } + + pub fn has_primary_key(&self) -> bool { + !self.primary_keys.is_empty() + } + + pub fn get_primary_keys(&self) -> &Vec { + &self.primary_keys + } + + pub fn get_physical_primary_keys(&self) -> &[String] { + &self.physical_primary_keys + } + + pub fn has_bucket_key(&self) -> bool { + !self.bucket_keys.is_empty() + } + + pub fn is_default_bucket_key(&self) -> bool { + if self.has_primary_key() { + self.bucket_keys == self.physical_primary_keys + } else { + self.bucket_keys.is_empty() + } + } + + pub fn get_bucket_keys(&self) -> &[String] { + &self.bucket_keys + } + + pub fn is_partitioned(&self) -> bool { + !self.partition_keys.is_empty() + } + + pub fn is_auto_partitioned(&self) -> bool { + self.is_partitioned() && todo!() + } + + pub fn get_partition_keys(&self) -> &[String] { + &self.partition_keys + } + + pub fn get_num_buckets(&self) -> i32 { + self.num_buckets + } + + pub fn get_properties(&self) -> &HashMap { + &self.properties + } + + pub fn get_table_config(&self) -> &TableConfig { + &self.table_config + } + + pub fn get_custom_properties(&self) -> &HashMap { + &self.custom_properties + } + + pub fn get_comment(&self) -> Option<&str> { + self.comment.as_deref() + } + + pub fn get_created_time(&self) -> i64 { + self.created_time + } + + pub fn get_modified_time(&self) -> i64 { + self.modified_time + } + + pub fn to_table_descriptor(&self) -> Result { + let mut builder = TableDescriptor::builder() + .schema(self.schema.clone()) + 
.partitioned_by(self.partition_keys.clone()) + .distributed_by(Some(self.num_buckets), self.bucket_keys.clone()) + .properties(self.properties.clone()) + .custom_properties(self.custom_properties.clone()); + + if let Some(comment) = &self.comment { + builder = builder.comment(&comment.clone()); + } + + builder.build() + } + + fn generate_physical_primary_key( + primary_keys: &[String], + partition_keys: &[String], + ) -> Vec { + primary_keys + .iter() + .filter(|pk| !partition_keys.contains(*pk)) + .cloned() + .collect() + } +} + +impl fmt::Display for TableInfo { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "TableInfo{{ table_path={:?}, table_id={}, schema_id={}, schema={:?}, physical_primary_keys={:?}, bucket_keys={:?}, partition_keys={:?}, num_buckets={}, properties={:?}, custom_properties={:?}, comment={:?}, created_time={}, modified_time={} }}", + self.table_path, + self.table_id, + self.schema_id, + self.schema, + self.physical_primary_keys, + self.bucket_keys, + self.partition_keys, + self.num_buckets, + self.properties, + self.custom_properties, + self.comment, + self.created_time, + self.modified_time + ) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, Hash, PartialEq, Eq)] +pub struct TableBucket { + table_id: i64, + partition_id: Option, + bucket: i32, +} + +impl TableBucket { + pub fn new(table_id: i64, bucket: i32) -> Self { + TableBucket { + table_id, + partition_id: None, + bucket, + } + } + + pub fn table_id(&self) -> i64 { + self.table_id + } + + pub fn bucket_id(&self) -> i32 { + self.bucket + } + + pub fn partition_id(&self) -> Option { + self.partition_id + } +} diff --git a/fluss-rust/crates/fluss/src/proto/fluss_api.proto b/fluss-rust/crates/fluss/src/proto/fluss_api.proto new file mode 100644 index 0000000000..195b8f824c --- /dev/null +++ b/fluss-rust/crates/fluss/src/proto/fluss_api.proto @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto2"; + +package proto; + +// metadata request and response, request send from client to each server. +message MetadataRequest { + repeated PbTablePath table_path = 1; + repeated PbPhysicalTablePath partitions_path = 2; + + // note: currently, we assume the partition ids must belong to the table_paths in the + // metadata request + // todo: we won't need the assumption after we introduce metadata cache in server + repeated int64 partitions_id = 3 [packed = true]; +} + +message MetadataResponse { + optional PbServerNode coordinator_server = 1; + repeated PbServerNode tablet_servers = 2; + repeated PbTableMetadata table_metadata = 3; + repeated PbPartitionMetadata partition_metadata = 4; +} + +// produce log request and response +message ProduceLogRequest { + required int32 acks = 1; + required int64 table_id = 2; + required int32 timeout_ms = 3; + repeated PbProduceLogReqForBucket buckets_req = 4; +} + + +message ProduceLogResponse { + repeated PbProduceLogRespForBucket buckets_resp = 1; +} + + +// --------------- Inner classes ---------------- +message PbTablePath { + required string database_name = 1; + required string table_name = 2; +} + +message PbPhysicalTablePath { + required string database_name = 1; + required string table_name = 2; + 
optional string partition_name = 3; +} + +// For MetadataResponse, host and port are still used for all versions. +// For UpdateMetadataRequest, +// * versions <= 0.6: host and port are used. +// * versions >= 0.7: listeners is used to replace host and port. +message PbServerNode { + required int32 node_id = 1; + required string host = 2; + required int32 port = 3; + optional string listeners = 4; +} + +message PbTableMetadata { + required PbTablePath table_path = 1; + required int64 table_id = 2; + required int32 schema_id = 3; + required bytes table_json = 4; + repeated PbBucketMetadata bucket_metadata = 5; + required int64 created_time = 6; + required int64 modified_time = 7; +} + +message PbPartitionMetadata { + required int64 table_id = 1; + // the partition name and id for the partition + required string partition_name = 2; + required int64 partition_id = 3; + repeated PbBucketMetadata bucket_metadata = 4; +} + +message PbBucketMetadata { + required int32 bucket_id = 1; + // optional as some time the leader may not elected yet + optional int32 leader_id = 2; + repeated int32 replica_id = 3 [packed = true]; + // TODO: Add isr here. 
+} + +message PbProduceLogReqForBucket { + optional int64 partition_id = 1; + required int32 bucket_id = 2; + required bytes records = 3; +} + +message PbProduceLogRespForBucket { + optional int64 partition_id = 1; + required int32 bucket_id = 2; + optional int32 error_code = 3; + optional string error_message = 4; + optional int64 base_offset = 5; +} + +message CreateTableRequest { + required PbTablePath table_path = 1; + required bytes table_json = 2; + required bool ignore_if_exists = 3; +} + +message CreateTableResponse { +} + + +message GetTableInfoRequest { + required PbTablePath table_path = 1; +} + +message GetTableInfoResponse { + required int64 table_id = 1; + required int32 schema_id = 2; + required bytes table_json = 3; + required int64 created_time = 4; + required int64 modified_time = 5; +} + + +// fetch log request and response +message FetchLogRequest { + required int32 follower_server_id = 1; // value -1 indicate the request from client. + required int32 max_bytes = 2; + repeated PbFetchLogReqForTable tables_req = 3; + optional int32 max_wait_ms = 4; + optional int32 min_bytes = 5; +} + +message FetchLogResponse { + repeated PbFetchLogRespForTable tables_resp = 1; +} + +message PbFetchLogReqForTable { + required int64 table_id = 1; + required bool projection_pushdown_enabled = 2; + repeated int32 projected_fields = 3 [packed = true]; + repeated PbFetchLogReqForBucket buckets_req = 4; +} + + +message PbFetchLogReqForBucket { + optional int64 partition_id = 1; + required int32 bucket_id = 2; + // TODO leader epoch + required int64 fetch_offset = 3; + required int32 max_fetch_bytes = 4; +} + + +message PbFetchLogRespForTable { + required int64 table_id = 1; + repeated PbFetchLogRespForBucket buckets_resp = 2; +} +message PbFetchLogRespForBucket { + optional int64 partition_id = 1; + required int32 bucket_id = 2; + optional int32 error_code = 3; + optional string error_message = 4; + optional int64 high_watermark = 5; + optional int64 log_start_offset 
= 6; // TODO now we don't introduce log start offset, but remain it in protobuf + optional PbRemoteLogFetchInfo remote_log_fetch_info = 7; + optional bytes records = 8; +} + +message PbRemoteLogFetchInfo { + required string remote_log_tablet_dir = 1; + optional string partition_name = 2; + repeated PbRemoteLogSegment remote_log_segments = 3; + optional int32 first_start_pos = 4; +} + +message PbRemoteLogSegment { + required string remote_log_segment_id = 1; + required int64 remote_log_start_offset = 2; + required int64 remote_log_end_offset = 3; + required int32 segment_size_in_bytes = 4; +} \ No newline at end of file diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs new file mode 100644 index 0000000000..2f595d0304 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -0,0 +1,545 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::array::{ + ArrayBuilder, ArrayRef, BooleanBuilder, Float32Builder, Float64Builder, Int8Builder, + Int16Builder, Int32Builder, Int64Builder, StringBuilder, UInt8Builder, UInt16Builder, + UInt32Builder, UInt64Builder, +}; +use arrow::{ + array::RecordBatch, + ipc::{reader::StreamReader, writer::StreamWriter}, +}; +use arrow_schema::SchemaRef; +use arrow_schema::{DataType as ArrowDataType, Field}; +use byteorder::WriteBytesExt; +use byteorder::{ByteOrder, LittleEndian}; +use crc32c::crc32c; +use parking_lot::Mutex; +use std::{ + io::{Cursor, Write}, + sync::Arc, +}; + +use crate::error::Result; +use crate::metadata::DataType; +use crate::record::{ChangeType, ScanRecord}; +use crate::row::{ColumnarRow, GenericRow}; + +/// const for record batch +pub const BASE_OFFSET_LENGTH: usize = 8; +pub const LENGTH_LENGTH: usize = 4; +pub const MAGIC_LENGTH: usize = 1; +pub const COMMIT_TIMESTAMP_LENGTH: usize = 8; +pub const CRC_LENGTH: usize = 4; +pub const SCHEMA_ID_LENGTH: usize = 2; +pub const ATTRIBUTE_LENGTH: usize = 1; +pub const LAST_OFFSET_DELTA_LENGTH: usize = 4; +pub const WRITE_CLIENT_ID_LENGTH: usize = 8; +pub const BATCH_SEQUENCE_LENGTH: usize = 4; +pub const RECORDS_COUNT_LENGTH: usize = 4; + +pub const BASE_OFFSET_OFFSET: usize = 0; +pub const LENGTH_OFFSET: usize = BASE_OFFSET_OFFSET + BASE_OFFSET_LENGTH; +pub const MAGIC_OFFSET: usize = LENGTH_OFFSET + LENGTH_LENGTH; +pub const COMMIT_TIMESTAMP_OFFSET: usize = MAGIC_OFFSET + MAGIC_LENGTH; +pub const CRC_OFFSET: usize = COMMIT_TIMESTAMP_OFFSET + COMMIT_TIMESTAMP_LENGTH; +pub const SCHEMA_ID_OFFSET: usize = CRC_OFFSET + CRC_LENGTH; +pub const ATTRIBUTES_OFFSET: usize = SCHEMA_ID_OFFSET + SCHEMA_ID_LENGTH; +pub const LAST_OFFSET_DELTA_OFFSET: usize = ATTRIBUTES_OFFSET + ATTRIBUTE_LENGTH; +pub const WRITE_CLIENT_ID_OFFSET: usize = LAST_OFFSET_DELTA_OFFSET + LAST_OFFSET_DELTA_LENGTH; +pub const BATCH_SEQUENCE_OFFSET: usize = WRITE_CLIENT_ID_OFFSET + WRITE_CLIENT_ID_LENGTH; +pub const 
RECORDS_COUNT_OFFSET: usize = BATCH_SEQUENCE_OFFSET + BATCH_SEQUENCE_LENGTH; +pub const RECORDS_OFFSET: usize = RECORDS_COUNT_OFFSET + RECORDS_COUNT_LENGTH; + +pub const RECORD_BATCH_HEADER_SIZE: usize = RECORDS_OFFSET; +pub const ARROW_CHANGETYPE_OFFSET: usize = RECORD_BATCH_HEADER_SIZE; +pub const LOG_OVERHEAD: usize = LENGTH_OFFSET + LENGTH_LENGTH; + +/// const for record +/// The "magic" values. +#[derive(Debug, Clone, Copy)] +pub enum LogMagicValue { + V0 = 0, +} + +pub const CURRENT_LOG_MAGIC_VALUE: u8 = LogMagicValue::V0 as u8; + +/// Value used if writer ID is not available or non-idempotent. +pub const NO_WRITER_ID: i64 = -1; + +/// Value used if batch sequence is not available. +pub const NO_BATCH_SEQUENCE: i32 = -1; + +pub const BUILDER_DEFAULT_OFFSET: i64 = 0; + +pub const DEFAULT_MAX_RECORD: i32 = 256; + +pub struct MemoryLogRecordsArrowBuilder { + base_log_offset: i64, + schema_id: i32, + magic: u8, + writer_id: i64, + batch_sequence: i32, + table_schema: SchemaRef, + record_count: i32, + arrow_column_builders: Mutex>>, + is_closed: bool, +} + +impl MemoryLogRecordsArrowBuilder { + pub fn new(schema_id: i32, row_type: &DataType) -> Self { + let schema_ref = to_arrow_schema(row_type); + let builders = Mutex::new( + schema_ref + .fields() + .iter() + .map(|field| Self::create_builder(field.data_type())) + .collect(), + ); + MemoryLogRecordsArrowBuilder { + base_log_offset: BUILDER_DEFAULT_OFFSET, + schema_id, + magic: CURRENT_LOG_MAGIC_VALUE, + writer_id: NO_WRITER_ID, + batch_sequence: NO_BATCH_SEQUENCE, + record_count: 0, + table_schema: schema_ref, + arrow_column_builders: builders, + is_closed: false, + } + } + + pub fn append(&mut self, row: &GenericRow) -> Result<()> { + for (idx, value) in row.values.iter().enumerate() { + let mut builder_binding = self.arrow_column_builders.lock(); + let builder = builder_binding.get_mut(idx).unwrap(); + value.append_to(builder.as_mut())?; + } + self.record_count += 1; + // todo: consider write other change type 
+ Ok(()) + } + + pub fn is_full(&self) -> bool { + self.record_count >= DEFAULT_MAX_RECORD + } + + pub fn is_closed(&self) -> bool { + self.is_closed + } + + pub fn close(&mut self) { + self.is_closed = true; + } + + pub fn build(&self) -> Result> { + // serialize arrow batch + let mut arrow_batch_bytes = vec![]; + let mut writer = StreamWriter::try_new(&mut arrow_batch_bytes, &self.table_schema)?; + + let arrays = self + .arrow_column_builders + .lock() + .iter_mut() + .map(|b| b.finish()) + .collect::>(); + let record_batch = RecordBatch::try_new(self.table_schema.clone(), arrays)?; + // get header len + let header = writer.get_ref().len(); + writer.write(&record_batch)?; + // get real arrow batch bytes + let real_arrow_batch_bytes = &arrow_batch_bytes[header..]; + + // now, write batch header and arrow batch + let mut batch_bytes = vec![0u8; RECORD_BATCH_HEADER_SIZE + real_arrow_batch_bytes.len()]; + // write batch header + self.write_batch_header(&mut batch_bytes[..])?; + + // write arrow batch bytes + let mut cursor = Cursor::new(&mut batch_bytes[..]); + cursor.set_position(RECORD_BATCH_HEADER_SIZE as u64); + cursor.write_all(real_arrow_batch_bytes).unwrap(); + + let calcute_crc_bytes = &cursor.get_ref()[SCHEMA_ID_OFFSET..]; + // then update crc + let crc = crc32c(calcute_crc_bytes); + cursor.set_position(CRC_OFFSET as u64); + cursor.write_u32::(crc)?; + + Ok(batch_bytes.to_vec()) + } + + fn write_batch_header(&self, buffer: &mut [u8]) -> Result<()> { + let total_len = buffer.len(); + let mut cursor = Cursor::new(buffer); + cursor.write_i64::(self.base_log_offset)?; + cursor + .write_i32::((total_len - BASE_OFFSET_LENGTH - LENGTH_LENGTH) as i32)?; + cursor.write_u8(self.magic)?; + cursor.write_i64::(0)?; // timestamp placeholder + cursor.write_u32::(0)?; // crc placeholder + cursor.write_i16::(self.schema_id as i16)?; + + // todo: curerntly, always is append only + let append_only = true; + cursor.write_u8(if append_only { 1 } else { 0 })?; + 
cursor.write_i32::(if self.record_count > 0 { + self.record_count - 1 + } else { + 0 + })?; + + cursor.write_i64::(self.writer_id)?; + cursor.write_i32::(self.batch_sequence)?; + cursor.write_i32::(self.record_count)?; + Ok(()) + } + + fn create_builder(data_type: &arrow_schema::DataType) -> Box { + match data_type { + arrow_schema::DataType::Int8 => Box::new(Int8Builder::new()), + arrow_schema::DataType::Int16 => Box::new(Int16Builder::new()), + arrow_schema::DataType::Int32 => Box::new(Int32Builder::new()), + arrow_schema::DataType::Int64 => Box::new(Int64Builder::new()), + arrow_schema::DataType::UInt8 => Box::new(UInt8Builder::new()), + arrow_schema::DataType::UInt16 => Box::new(UInt16Builder::new()), + arrow_schema::DataType::UInt32 => Box::new(UInt32Builder::new()), + arrow_schema::DataType::UInt64 => Box::new(UInt64Builder::new()), + arrow_schema::DataType::Float32 => Box::new(Float32Builder::new()), + arrow_schema::DataType::Float64 => Box::new(Float64Builder::new()), + arrow_schema::DataType::Boolean => Box::new(BooleanBuilder::new()), + arrow_schema::DataType::Utf8 => Box::new(StringBuilder::new()), + dt => panic!("Unsupported data type: {dt:?}"), + } + } +} + +pub trait ToArrow { + fn append_to(&self, builder: &mut dyn ArrayBuilder) -> Result<()>; +} + +pub struct LogRecordsBatchs<'a> { + data: &'a [u8], + current_pos: usize, + remaining_bytes: usize, +} + +impl<'a> LogRecordsBatchs<'a> { + pub fn new(data: &'a [u8]) -> Self { + let remaining_bytes: usize = data.len(); + Self { + data, + current_pos: 0, + remaining_bytes, + } + } + + pub fn next_batch_size(&self) -> Option { + if self.remaining_bytes < LOG_OVERHEAD { + return None; + } + + let batch_size_bytes = + LittleEndian::read_i32(self.data.get(self.current_pos + LENGTH_OFFSET..).unwrap()); + Some(batch_size_bytes as usize + LOG_OVERHEAD) + } +} + +impl<'a> Iterator for &'a mut LogRecordsBatchs<'a> { + type Item = LogRecordBatch<'a>; + + fn next(&mut self) -> Option { + match self.next_batch_size() 
{ + Some(batch_size) => { + let data_slice = &self.data[self.current_pos..self.current_pos + batch_size]; + let record_batch = LogRecordBatch::new(data_slice); + self.current_pos += batch_size; + self.remaining_bytes -= batch_size; + Some(record_batch) + } + None => None, + } + } +} + +pub struct LogRecordBatch<'a> { + data: &'a [u8], +} + +#[allow(dead_code)] +impl<'a> LogRecordBatch<'a> { + pub fn new(data: &'a [u8]) -> Self { + LogRecordBatch { data } + } + + pub fn magic(&self) -> u8 { + self.data[MAGIC_OFFSET] + } + + pub fn commit_timestamp(&self) -> i64 { + let offset = COMMIT_TIMESTAMP_OFFSET; + LittleEndian::read_i64(&self.data[offset..offset + COMMIT_TIMESTAMP_LENGTH]) + } + + pub fn writer_id(&self) -> i64 { + let offset = WRITE_CLIENT_ID_OFFSET; + LittleEndian::read_i64(&self.data[offset..offset + WRITE_CLIENT_ID_LENGTH]) + } + + pub fn batch_sequence(&self) -> i32 { + let offset = BATCH_SEQUENCE_OFFSET; + LittleEndian::read_i32(&self.data[offset..offset + BATCH_SEQUENCE_LENGTH]) + } + + pub fn ensure_valid(&self) -> Result<()> { + // todo + Ok(()) + } + + pub fn is_valid(&self) -> bool { + self.size_in_bytes() >= RECORD_BATCH_HEADER_SIZE + && self.checksum() == self.compute_checksum() + } + + fn compute_checksum(&self) -> u32 { + let start = SCHEMA_ID_OFFSET; + let end = start + self.data.len(); + crc32c(&self.data[start..end]) + } + + fn attributes(&self) -> u8 { + self.data[ATTRIBUTES_OFFSET] + } + + pub fn next_log_offset(&self) -> i64 { + self.last_log_offset() + 1 + } + + pub fn checksum(&self) -> u32 { + let offset = CRC_OFFSET; + LittleEndian::read_u32(&self.data[offset..offset + CRC_OFFSET]) + } + + pub fn schema_id(&self) -> i16 { + let offset = SCHEMA_ID_OFFSET; + LittleEndian::read_i16(&self.data[offset..offset + SCHEMA_ID_OFFSET]) + } + + pub fn base_log_offset(&self) -> i64 { + let offset = BASE_OFFSET_OFFSET; + LittleEndian::read_i64(&self.data[offset..offset + BASE_OFFSET_LENGTH]) + } + + pub fn last_log_offset(&self) -> i64 { + 
self.base_log_offset() + self.last_offset_delta() as i64 + } + + fn last_offset_delta(&self) -> i32 { + let offset = LAST_OFFSET_DELTA_OFFSET; + LittleEndian::read_i32(&self.data[offset..offset + LAST_OFFSET_DELTA_LENGTH]) + } + + pub fn size_in_bytes(&self) -> usize { + let offset = LENGTH_OFFSET; + LittleEndian::read_i32(&self.data[offset..offset + LENGTH_LENGTH]) as usize + LOG_OVERHEAD + } + + pub fn record_count(&self) -> i32 { + let offset = RECORDS_COUNT_OFFSET; + LittleEndian::read_i32(&self.data[offset..offset + RECORDS_COUNT_LENGTH]) + } + + pub fn records(&self, read_context: ReadContext) -> LogRecordIterator { + let count = self.record_count(); + if count == 0 { + return LogRecordIterator::empty(); + } + + // get arrow_metadata + let arrow_metadata_bytes = read_context.to_arrow_metadata().unwrap(); + // arrow_batch_data + let data = &self.data[RECORDS_OFFSET..]; + + // need to combine arrow_metadata_bytes + arrow_batch_data + let cursor = Cursor::new([&arrow_metadata_bytes, data].concat()); + let mut stream_reader = StreamReader::try_new(cursor, None).unwrap(); + + let mut record_batch = None; + if let Some(bath) = stream_reader.next() { + record_batch = Some(bath.unwrap()); + } + + if record_batch.is_none() { + return LogRecordIterator::empty(); + } + + let arrow_reader = ArrowReader::new(Arc::new(record_batch.unwrap())); + LogRecordIterator::Arrow(ArrowLogRecordIterator { + reader: arrow_reader, + base_offset: self.base_log_offset(), + timestamp: self.commit_timestamp(), + row_id: 0, + change_type: ChangeType::AppendOnly, + }) + } +} + +pub fn to_arrow_schema(fluss_schema: &DataType) -> SchemaRef { + match &fluss_schema { + DataType::Row(row_type) => { + let fields: Vec = row_type + .fields() + .iter() + .map(|f| { + Field::new( + f.name(), + to_arrow_type(f.data_type()), + f.data_type().is_nullable(), + ) + }) + .collect(); + + SchemaRef::new(arrow_schema::Schema::new(fields)) + } + _ => { + panic!("must be row data tyoe.") + } + } +} + +pub fn 
to_arrow_type(fluss_type: &DataType) -> ArrowDataType { + match fluss_type { + DataType::Boolean(_) => ArrowDataType::Boolean, + DataType::TinyInt(_) => ArrowDataType::Int8, + DataType::SmallInt(_) => ArrowDataType::Int16, + DataType::BigInt(_) => ArrowDataType::Int64, + DataType::Int(_) => ArrowDataType::Int32, + DataType::Float(_) => ArrowDataType::Float32, + DataType::Double(_) => ArrowDataType::Float64, + DataType::Char(_) => ArrowDataType::Utf8, + DataType::String(_) => ArrowDataType::Utf8, + DataType::Decimal(_) => todo!(), + DataType::Date(_) => ArrowDataType::Date32, + DataType::Time(_) => todo!(), + DataType::Timestamp(_) => todo!(), + DataType::TimestampLTz(_) => todo!(), + DataType::Bytes(_) => todo!(), + DataType::Binary(_) => todo!(), + DataType::Array(_data_type) => todo!(), + DataType::Map(_data_type) => todo!(), + DataType::Row(_data_fields) => todo!(), + } +} + +pub struct ReadContext { + arrow_schema: SchemaRef, +} + +impl ReadContext { + pub fn new(arrow_schema: SchemaRef) -> ReadContext { + ReadContext { arrow_schema } + } + + pub fn to_arrow_metadata(&self) -> Result> { + let mut arrow_schema_bytes = vec![]; + let _writer = StreamWriter::try_new(&mut arrow_schema_bytes, &self.arrow_schema)?; + Ok(arrow_schema_bytes) + } +} + +pub enum LogRecordIterator { + Empty, + Arrow(ArrowLogRecordIterator), +} + +impl LogRecordIterator { + pub fn empty() -> Self { + LogRecordIterator::Empty + } +} + +impl Iterator for LogRecordIterator { + type Item = ScanRecord; + + fn next(&mut self) -> Option { + match self { + LogRecordIterator::Empty => None, + LogRecordIterator::Arrow(iter) => iter.next(), + } + } +} + +pub struct ArrowLogRecordIterator { + reader: ArrowReader, + base_offset: i64, + timestamp: i64, + row_id: usize, + change_type: ChangeType, +} + +#[allow(dead_code)] +impl ArrowLogRecordIterator { + fn new(reader: ArrowReader, base_offset: i64, timestamp: i64, change_type: ChangeType) -> Self { + Self { + reader, + base_offset, + timestamp, + row_id: 
0, + change_type, + } + } +} + +impl Iterator for ArrowLogRecordIterator { + type Item = ScanRecord; + + fn next(&mut self) -> Option { + if self.row_id >= self.reader.row_count() { + return None; + } + + let columnar_row = self.reader.read(self.row_id); + let scan_record = ScanRecord::new( + columnar_row, + self.base_offset + self.row_id as i64, + self.timestamp, + self.change_type, + ); + self.row_id += 1; + Some(scan_record) + } +} + +pub struct ArrowReader { + record_batch: Arc, +} + +impl ArrowReader { + pub fn new(record_batch: Arc) -> Self { + ArrowReader { record_batch } + } + + pub fn row_count(&self) -> usize { + self.record_batch.num_rows() + } + + pub fn read(&self, row_id: usize) -> ColumnarRow { + ColumnarRow::new_with_row_id(self.record_batch.clone(), row_id) + } +} +pub struct MyVec(pub StreamReader); diff --git a/fluss-rust/crates/fluss/src/record/error.rs b/fluss-rust/crates/fluss/src/record/error.rs new file mode 100644 index 0000000000..22704a0cdf --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/error.rs @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::io; +use thiserror::Error; + +#[derive(Error, Debug)] +#[non_exhaustive] +#[allow(dead_code)] +pub enum Error { + #[error(transparent)] + Io(#[from] io::Error), +} diff --git a/fluss-rust/crates/fluss/src/record/mod.rs b/fluss-rust/crates/fluss/src/record/mod.rs new file mode 100644 index 0000000000..d7872055b9 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/mod.rs @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::metadata::TableBucket; +use crate::row::ColumnarRow; +use core::fmt; +use std::collections::HashMap; + +mod arrow; +mod error; + +pub use arrow::*; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum ChangeType { + /// Append-only operation + AppendOnly, + /// Insert operation + Insert, + /// Update operation containing the previous content of the updated row + UpdateBefore, + /// Update operation containing the new content of the updated row + UpdateAfter, + /// Delete operation + Delete, +} + +impl ChangeType { + /// Returns a short string representation of this ChangeType + pub fn short_string(&self) -> &'static str { + match self { + ChangeType::AppendOnly => "+A", + ChangeType::Insert => "+I", + ChangeType::UpdateBefore => "-U", + ChangeType::UpdateAfter => "+U", + ChangeType::Delete => "-D", + } + } + + /// Returns the byte value representation used for serialization + pub fn to_byte_value(&self) -> u8 { + match self { + ChangeType::AppendOnly => 0, + ChangeType::Insert => 1, + ChangeType::UpdateBefore => 2, + ChangeType::UpdateAfter => 3, + ChangeType::Delete => 4, + } + } + + /// Creates a ChangeType from its byte value representation + /// + /// # Errors + /// Returns an error if the byte value doesn't correspond to any ChangeType + pub fn from_byte_value(value: u8) -> Result { + match value { + 0 => Ok(ChangeType::AppendOnly), + 1 => Ok(ChangeType::Insert), + 2 => Ok(ChangeType::UpdateBefore), + 3 => Ok(ChangeType::UpdateAfter), + 4 => Ok(ChangeType::Delete), + _ => Err(format!("Unsupported byte value '{value}' for change type")), + } + } +} + +impl fmt::Display for ChangeType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.short_string()) + } +} + +pub struct ScanRecord { + pub row: ColumnarRow, + offset: i64, + timestamp: i64, + change_type: ChangeType, +} + +impl ScanRecord { + const INVALID: i64 = -1; + + pub fn new_default(row: ColumnarRow) -> Self { + ScanRecord { + row, + offset: 
Self::INVALID, + timestamp: Self::INVALID, + change_type: ChangeType::Insert, + } + } + + pub fn new(row: ColumnarRow, offset: i64, timestamp: i64, change_type: ChangeType) -> Self { + ScanRecord { + row, + offset, + timestamp, + change_type, + } + } + + pub fn row(&self) -> &ColumnarRow { + &self.row + } + + /// Returns the position in the log + pub fn offset(&self) -> i64 { + self.offset + } + + /// Returns the timestamp + pub fn timestamp(&self) -> i64 { + self.timestamp + } + + /// Returns the change type + pub fn change_type(&self) -> &ChangeType { + &self.change_type + } +} + +pub struct ScanRecords { + records: HashMap>, +} + +impl ScanRecords { + pub fn empty() -> Self { + Self { + records: HashMap::new(), + } + } + + pub fn new(records: HashMap>) -> Self { + Self { records } + } + + pub fn records(&self, scan_bucket: &TableBucket) -> &[ScanRecord] { + self.records.get(scan_bucket).map_or(&[], |records| records) + } + + pub fn count(&self) -> usize { + self.records.values().map(|v| v.len()).sum() + } + + pub fn is_empty(&self) -> bool { + self.records.is_empty() + } +} + +impl IntoIterator for ScanRecords { + type Item = ScanRecord; + type IntoIter = std::vec::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.records + .into_values() + .flatten() + .collect::>() + .into_iter() + } +} diff --git a/fluss-rust/crates/fluss/src/row/column.rs b/fluss-rust/crates/fluss/src/row/column.rs new file mode 100644 index 0000000000..44ca640b51 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/column.rs @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::row::InternalRow; +use arrow::array::{ + AsArray, BinaryArray, FixedSizeBinaryArray, Float32Array, Float64Array, Int8Array, Int16Array, + Int32Array, Int64Array, RecordBatch, StringArray, +}; +use std::sync::Arc; + +pub struct ColumnarRow { + record_batch: Arc, + row_id: usize, +} + +impl ColumnarRow { + pub fn new(batch: Arc) -> Self { + ColumnarRow { + record_batch: batch, + row_id: 0, + } + } + + pub fn new_with_row_id(bach: Arc, row_id: usize) -> Self { + ColumnarRow { + record_batch: bach, + row_id, + } + } + + pub fn set_row_id(&mut self, row_id: usize) { + self.row_id = row_id + } +} + +impl InternalRow for ColumnarRow { + fn get_field_count(&self) -> usize { + self.record_batch.num_columns() + } + + fn is_null_at(&self, pos: usize) -> bool { + self.record_batch.column(pos).is_null(self.row_id) + } + + fn get_boolean(&self, pos: usize) -> bool { + self.record_batch + .column(pos) + .as_boolean() + .value(self.row_id) + } + + fn get_byte(&self, pos: usize) -> i8 { + self.record_batch + .column(pos) + .as_any() + .downcast_ref::() + .expect("Expect byte array") + .value(self.row_id) + } + + fn get_short(&self, pos: usize) -> i16 { + self.record_batch + .column(pos) + .as_any() + .downcast_ref::() + .expect("Expect short array") + .value(self.row_id) + } + + fn get_int(&self, pos: usize) -> i32 { + self.record_batch + .column(pos) + .as_any() + .downcast_ref::() + .expect("Expect int array") + .value(self.row_id) + } + + fn get_long(&self, pos: usize) -> i64 { + self.record_batch + .column(pos) + .as_any() + .downcast_ref::() + 
.expect("Expect long array") + .value(self.row_id) + } + + fn get_float(&self, pos: usize) -> f32 { + self.record_batch + .column(pos) + .as_any() + .downcast_ref::() + .expect("Expect float32 array") + .value(self.row_id) + } + + fn get_double(&self, pos: usize) -> f64 { + self.record_batch + .column(pos) + .as_any() + .downcast_ref::() + .expect("Expect float64 array") + .value(self.row_id) + } + + fn get_char(&self, pos: usize, length: usize) -> String { + let array = self + .record_batch + .column(pos) + .as_any() + .downcast_ref::() + .expect("Expected fixed-size binary array for char type"); + + let bytes = array.value(self.row_id); + if bytes.len() != length { + panic!( + "Length mismatch for fixed-size char: expected {}, got {}", + length, + bytes.len() + ); + } + + String::from_utf8(bytes.to_vec()) + .unwrap_or_else(|_| String::from_utf8_lossy(bytes).into_owned()) + } + + fn get_string(&self, pos: usize) -> &str { + self.record_batch + .column(pos) + .as_any() + .downcast_ref::() + .expect("Expected String array.") + .value(self.row_id) + } + + fn get_binary(&self, pos: usize, _length: usize) -> Vec { + self.record_batch + .column(pos) + .as_any() + .downcast_ref::() + .expect("Expected binary array.") + .value(self.row_id) + .to_vec() + } + + fn get_bytes(&self, pos: usize) -> Vec { + self.record_batch + .column(pos) + .as_any() + .downcast_ref::() + .expect("Expected bytes array.") + .value(self.row_id) + .to_vec() + } +} diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs new file mode 100644 index 0000000000..3c65a7d4ee --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/datum.rs @@ -0,0 +1,287 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use chrono::Datelike; + +use crate::error::Error::RowConvertError; +use crate::error::Result; +use arrow::array::{ArrayBuilder, Int8Builder, Int16Builder, Int32Builder, StringBuilder}; +use chrono::NaiveDate; +use ordered_float::OrderedFloat; +use parse_display::Display; +use ref_cast::RefCast; +use rust_decimal::Decimal; +use serde::{Deserialize, Serialize}; +use std::fmt; +use std::ops::Deref; + +#[allow(dead_code)] +const THIRTY_YEARS_MICROSECONDS: i64 = 946_684_800_000_000; + +pub const UNIX_EPOCH_DAYS: i32 = 719_163; + +#[derive(Debug, Clone, Display, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] +pub enum Datum<'a> { + #[display("null")] + Null, + #[display("{0}")] + Bool(bool), + #[display("{0}")] + Int16(i16), + #[display("{0}")] + Int32(i32), + #[display("{0}")] + Int64(i64), + #[display("{0}")] + Float64(F64), + #[display("'{0}'")] + String(&'a str), + #[display("{0}")] + Blob(Blob), + #[display("{0}")] + Decimal(Decimal), + #[display("{0}")] + Date(Date), + #[display("{0}")] + Timestamp(Timestamp), + #[display("{0}")] + TimestampTz(TimestampLtz), +} + +impl Datum<'_> { + pub fn is_null(&self) -> bool { + matches!(self, Datum::Null) + } + + pub fn as_str(&self) -> &str { + match self { + Self::String(s) => s, + _ => panic!("not a string: {self:?}"), + } + } +} + +// ----------- implement from +impl<'a> From for Datum<'a> { + #[inline] + fn from(i: i32) -> Datum<'a> { + 
Datum::Int32(i) + } +} + +impl<'a> From<&'a str> for Datum<'a> { + #[inline] + fn from(s: &'a str) -> Datum<'a> { + Datum::String(s) + } +} + +impl From> for Datum<'_> { + fn from(_: Option<&()>) -> Self { + Self::Null + } +} + +impl TryFrom<&Datum<'_>> for i32 { + type Error = (); + + #[inline] + fn try_from(from: &Datum) -> std::result::Result { + match from { + Datum::Int32(i) => Ok(*i), + _ => Err(()), + } + } +} + +impl<'a> TryFrom<&Datum<'a>> for &'a str { + type Error = (); + + #[inline] + fn try_from(from: &Datum<'a>) -> std::result::Result { + match from { + Datum::String(i) => Ok(*i), + _ => Err(()), + } + } +} + +pub trait ToArrow { + fn append_to(&self, builder: &mut dyn ArrayBuilder) -> Result<()>; +} + +impl Datum<'_> { + pub fn append_to(&self, builder: &mut dyn ArrayBuilder) -> Result<()> { + match self { + Datum::Null => { + todo!() + } + Datum::Bool(_v) => { + todo!() + } + Datum::Int16(_v) => { + todo!() + } + Datum::Int32(v) => { + v.append_to(builder)?; + } + Datum::Int64(_v) => { + todo!() + } + Datum::Float64(_v) => { + todo!() + } + Datum::String(v) => { + v.append_to(builder)?; + } + Datum::Blob(_v) => { + todo!() + } + Datum::Decimal(_v) => { + todo!() + } + Datum::Date(_v) => { + todo!() + } + Datum::Timestamp(_v) => { + todo!() + } + Datum::TimestampTz(_v) => { + todo!() + } + } + Ok(()) + } +} + +macro_rules! 
impl_to_arrow { + ($ty:ty, $variant:ident) => { + impl ToArrow for $ty { + fn append_to(&self, builder: &mut dyn ArrayBuilder) -> Result<()> { + if let Some(b) = builder.as_any_mut().downcast_mut::<$variant>() { + b.append_value(*self); + Ok(()) + } else { + Err(RowConvertError(format!( + "Cannot cast {} to {} builder", + stringify!($ty), + stringify!($variant) + ))) + } + } + } + }; +} + +impl_to_arrow!(i8, Int8Builder); +impl_to_arrow!(i16, Int16Builder); +impl_to_arrow!(i32, Int32Builder); +impl_to_arrow!(&str, StringBuilder); + +#[allow(dead_code)] +pub type F32 = OrderedFloat; +pub type F64 = OrderedFloat; +#[allow(dead_code)] +pub type Str = Box; + +#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Serialize, Deserialize, Default)] +pub struct Blob(Box<[u8]>); + +impl Deref for Blob { + type Target = BlobRef; + + fn deref(&self) -> &Self::Target { + BlobRef::new(&self.0) + } +} + +impl BlobRef { + pub fn new(bytes: &[u8]) -> &Self { + // SAFETY: `&BlobRef` and `&[u8]` have the same layout. + BlobRef::ref_cast(bytes) + } +} + +/// A slice of a blob. 
+#[repr(transparent)] +#[derive(PartialEq, Eq, PartialOrd, Ord, RefCast, Hash)] +pub struct BlobRef([u8]); + +impl fmt::Debug for Blob { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?}", self.as_ref()) + } +} + +impl fmt::Display for Blob { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?}", self.as_ref()) + } +} + +impl AsRef<[u8]> for BlobRef { + fn as_ref(&self) -> &[u8] { + &self.0 + } +} + +impl Deref for BlobRef { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[derive(PartialOrd, Ord, Display, PartialEq, Eq, Debug, Copy, Clone, Default, Hash, Serialize)] +pub struct Date(i32); + +#[derive(PartialOrd, Ord, Display, PartialEq, Eq, Debug, Copy, Clone, Default, Hash, Serialize)] +pub struct Timestamp(i64); + +#[derive(PartialOrd, Ord, Display, PartialEq, Eq, Debug, Copy, Clone, Default, Hash, Serialize)] +pub struct TimestampLtz(i64); + +impl From> for Blob { + fn from(vec: Vec) -> Self { + Blob(vec.into()) + } +} + +impl Date { + pub const fn new(inner: i32) -> Self { + Date(inner) + } + + /// Get the inner value of date type + pub fn get_inner(&self) -> i32 { + self.0 + } + + pub fn year(&self) -> i32 { + let date = NaiveDate::from_num_days_from_ce_opt(self.0 + UNIX_EPOCH_DAYS).unwrap(); + date.year() + } + pub fn month(&self) -> i32 { + let date = NaiveDate::from_num_days_from_ce_opt(self.0 + UNIX_EPOCH_DAYS).unwrap(); + date.month() as i32 + } + pub fn day(&self) -> i32 { + let date = NaiveDate::from_num_days_from_ce_opt(self.0 + UNIX_EPOCH_DAYS).unwrap(); + date.day() as i32 + } +} diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs new file mode 100644 index 0000000000..ead6ff0067 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::row::datum::Datum; + +mod column; + +mod datum; + +pub use column::*; + +pub trait InternalRow { + /// Returns the number of fields in this row + fn get_field_count(&self) -> usize; + + /// Returns true if the element is null at the given position + fn is_null_at(&self, pos: usize) -> bool; + + /// Returns the boolean value at the given position + fn get_boolean(&self, pos: usize) -> bool; + + /// Returns the byte value at the given position + fn get_byte(&self, pos: usize) -> i8; + + /// Returns the short value at the given position + fn get_short(&self, pos: usize) -> i16; + + /// Returns the integer value at the given position + fn get_int(&self, pos: usize) -> i32; + + /// Returns the long value at the given position + fn get_long(&self, pos: usize) -> i64; + + /// Returns the float value at the given position + fn get_float(&self, pos: usize) -> f32; + + /// Returns the double value at the given position + fn get_double(&self, pos: usize) -> f64; + + /// Returns the string value at the given position with fixed length + fn get_char(&self, pos: usize, length: usize) -> String; + + /// Returns the string value at the given position + fn get_string(&self, pos: usize) -> &str; + + // /// Returns the decimal value at the given position + // fn 
get_decimal(&self, pos: usize, precision: usize, scale: usize) -> Decimal; + + // /// Returns the timestamp value at the given position + // fn get_timestamp_ntz(&self, pos: usize, precision: usize) -> TimestampNtz; + + // /// Returns the timestamp value at the given position + // fn get_timestamp_ltz(&self, pos: usize, precision: usize) -> TimestampLtz; + + /// Returns the binary value at the given position with fixed length + fn get_binary(&self, pos: usize, length: usize) -> Vec; + + /// Returns the binary value at the given position + fn get_bytes(&self, pos: usize) -> Vec; +} + +pub struct GenericRow<'a> { + pub values: Vec>, +} + +impl<'a> InternalRow for GenericRow<'a> { + fn get_field_count(&self) -> usize { + self.values.len() + } + + fn is_null_at(&self, _pos: usize) -> bool { + false + } + + fn get_boolean(&self, _pos: usize) -> bool { + todo!() + } + + fn get_byte(&self, _pos: usize) -> i8 { + todo!() + } + + fn get_short(&self, _pos: usize) -> i16 { + todo!() + } + + fn get_int(&self, pos: usize) -> i32 { + self.values.get(pos).unwrap().try_into().unwrap() + } + + fn get_long(&self, _pos: usize) -> i64 { + todo!() + } + + fn get_float(&self, _pos: usize) -> f32 { + todo!() + } + + fn get_double(&self, _pos: usize) -> f64 { + todo!() + } + + fn get_char(&self, _pos: usize, _length: usize) -> String { + todo!() + } + + fn get_string(&self, pos: usize) -> &str { + self.values.get(pos).unwrap().try_into().unwrap() + } + + fn get_binary(&self, _pos: usize, _length: usize) -> Vec { + todo!() + } + + fn get_bytes(&self, _pos: usize) -> Vec { + todo!() + } +} + +impl<'a> Default for GenericRow<'a> { + fn default() -> Self { + Self::new() + } +} + +impl<'a> GenericRow<'a> { + pub fn new() -> GenericRow<'a> { + GenericRow { values: vec![] } + } + + pub fn set_field(&mut self, pos: usize, value: impl Into>) { + self.values.insert(pos, value.into()); + } +} diff --git a/fluss-rust/crates/fluss/src/rpc/api_key.rs b/fluss-rust/crates/fluss/src/rpc/api_key.rs new 
// file mode 100644
// index 0000000000..49282084ef
// --- /dev/null
// +++ b/fluss-rust/crates/fluss/src/rpc/api_key.rs
// @@ -0,0 +1,54 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use self::ApiKey::Unknown;

/// Identifies an RPC request type by its numeric wire code.
///
/// Codes without a named variant are preserved in `Unknown`.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)]
pub enum ApiKey {
    CreateTable,
    ProduceLog,
    FetchLog,
    MetaData,
    GetTable,
    Unknown(i16),
}

impl From<i16> for ApiKey {
    /// Maps a wire code to its key; unrecognized codes become `Unknown(code)`.
    fn from(code: i16) -> Self {
        match code {
            1005 => Self::CreateTable,
            1007 => Self::GetTable,
            1012 => Self::MetaData,
            1014 => Self::ProduceLog,
            1015 => Self::FetchLog,
            other => Unknown(other),
        }
    }
}

impl From<ApiKey> for i16 {
    /// Maps a key back to its wire code; `Unknown` returns the code it carries.
    fn from(api_key: ApiKey) -> Self {
        match api_key {
            ApiKey::CreateTable => 1005,
            ApiKey::GetTable => 1007,
            ApiKey::MetaData => 1012,
            ApiKey::ProduceLog => 1014,
            ApiKey::FetchLog => 1015,
            Unknown(code) => code,
        }
    }
}
// diff --git a/fluss-rust/crates/fluss/src/rpc/api_version.rs b/fluss-rust/crates/fluss/src/rpc/api_version.rs
// new file mode 100644
// index 0000000000..395c45cd0f
// --- /dev/null
// +++ b/fluss-rust/crates/fluss/src/rpc/api_version.rs
// @@ -0,0 +1,54 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.
/// Version number of an RPC API, wrapping the raw `i16`.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)]
pub struct ApiVersion(pub i16);

impl std::fmt::Display for ApiVersion {
    /// Prints the raw version number.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(raw) = self;
        write!(f, "{raw}")
    }
}

/// A pair of versions `min..=max`; construction checks `min <= max`.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct ApiVersionRange {
    min: ApiVersion,
    max: ApiVersion,
}

#[allow(dead_code)]
impl ApiVersionRange {
    /// Builds a range from its two bounds.
    ///
    /// # Panics
    /// Panics when `min` is greater than `max`.
    pub const fn new(min: ApiVersion, max: ApiVersion) -> Self {
        assert!(min.0 <= max.0);

        Self { min, max }
    }

    /// Lower bound of the range.
    pub fn min(&self) -> ApiVersion {
        let Self { min, .. } = *self;
        min
    }

    /// Upper bound of the range.
    pub fn max(&self) -> ApiVersion {
        let Self { max, .. } = *self;
        max
    }
}

impl std::fmt::Display for ApiVersionRange {
    /// Prints the range as `min:max`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { min, max } = self;
        write!(f, "{min}:{max}")
    }
}
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::cluster::{ServerNode, ServerType}; +use crate::metadata::TablePath; +use crate::proto::{PbServerNode, PbTablePath}; + +pub fn to_table_path(table_path: &TablePath) -> PbTablePath { + PbTablePath { + database_name: table_path.database().to_string(), + table_name: table_path.table().to_string(), + } +} + +pub fn from_pb_server_node(pb_server_node: PbServerNode, server_type: ServerType) -> ServerNode { + ServerNode::new( + pb_server_node.node_id, + pb_server_node.host, + pb_server_node.port as u32, + server_type, + ) +} + +pub fn from_pb_table_path(pb_table_path: &PbTablePath) -> TablePath { + TablePath::new( + pb_table_path.database_name.to_string(), + pb_table_path.table_name.to_string(), + ) +} diff --git a/fluss-rust/crates/fluss/src/rpc/error.rs b/fluss-rust/crates/fluss/src/rpc/error.rs new file mode 100644 index 0000000000..84b20b102e --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/error.rs @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use std::sync::Arc; +use thiserror::Error; + +#[derive(Error, Debug)] +#[non_exhaustive] +pub enum RpcError { + #[error("Cannot write message: {0}")] + WriteMessageError(#[from] crate::rpc::frame::WriteError), + + #[error("Cannot read framed message: {0}")] + ReadMessageError(#[from] crate::rpc::frame::ReadError), + + #[error("connection error")] + ConnectionError(String), + + #[error("IO Error: {0}")] + IO(#[from] std::io::Error), + + #[error("Connection is poisoned: {0}")] + Poisoned(Arc), + + #[error( + "Data left at the end of the message. Got {message_size} bytes but only read {read} bytes. api_key={api_key:?} api_version={api_version}" + )] + TooMuchData { + message_size: u64, + read: u64, + api_key: ApiKey, + api_version: ApiVersion, + }, +} diff --git a/fluss-rust/crates/fluss/src/rpc/frame.rs b/fluss-rust/crates/fluss/src/rpc/frame.rs new file mode 100644 index 0000000000..44dadc9408 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/frame.rs @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; + +#[derive(Error, Debug)] +#[non_exhaustive] +pub enum ReadError { + #[error("Cannot read data: {0}")] + IO(#[from] std::io::Error), + + #[error("Negative message size: {size}")] + NegativeMessageSize { size: i32 }, + + #[error("Message too large, limit is {limit} bytes but got {actual} bytes")] + MessageTooLarge { limit: usize, actual: usize }, +} + +pub trait AsyncMessageRead { + fn read_message( + &mut self, + max_message_size: usize, + ) -> impl Future, ReadError>> + Send; +} + +impl AsyncMessageRead for R +where + R: AsyncRead + Send + Unpin, +{ + async fn read_message(&mut self, max_message_size: usize) -> Result, ReadError> { + let mut len_buf = [0u8; 4]; + self.read_exact(&mut len_buf).await?; + let len = i32::from_be_bytes(len_buf); + + let len = usize::try_from(len).map_err(|_| ReadError::NegativeMessageSize { size: len })?; + // check max message size to not blow up memory + if len > max_message_size { + // We need to seek so that next message is readable. However `self.seek` would require `R: AsyncSeek` which + // doesn't hold for many types we want to work with. So do some manual seeking. 
+ let mut to_read = len; + let mut buf = vec![]; // allocate empty buffer + while to_read > 0 { + let step = max_message_size.min(to_read); + + // resize buffer if required + buf.resize(step, 0); + + self.read_exact(&mut buf).await?; + to_read -= step; + } + + return Err(ReadError::MessageTooLarge { + limit: max_message_size, + actual: len, + }); + } + + let mut buf = vec![0u8; len]; + self.read_exact(&mut buf).await?; + Ok(buf) + } +} + +#[derive(Error, Debug)] +#[non_exhaustive] +pub enum WriteError { + #[error("Cannot write data: {0}")] + IO(#[from] std::io::Error), + + #[error("Message too large: {size}")] + TooLarge { size: usize }, +} + +pub trait AsyncMessageWrite { + fn write_message(&mut self, msg: &[u8]) -> impl Future> + Send; +} + +impl AsyncMessageWrite for W +where + W: AsyncWrite + Send + Unpin, +{ + async fn write_message(&mut self, msg: &[u8]) -> Result<(), WriteError> { + let len = i32::try_from(msg.len()).map_err(|_| WriteError::TooLarge { size: msg.len() })?; + self.write_all(len.to_be_bytes().as_ref()).await?; + + if !msg.is_empty() { + self.write_all(msg).await?; + } + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/rpc/message/create_table.rs b/fluss-rust/crates/fluss/src/rpc/message/create_table.rs new file mode 100644 index 0000000000..5802e71797 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/create_table.rs @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::metadata::{JsonSerde, TableDescriptor, TablePath}; +use crate::{impl_read_version_type, impl_write_version_type, proto}; + +use crate::error::Result as FlussResult; +use crate::proto::CreateTableResponse; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::convert::to_table_path; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; + +use bytes::{Buf, BufMut}; +use prost::Message; + +#[derive(Debug)] +pub struct CreateTableRequest { + pub inner_request: proto::CreateTableRequest, +} + +impl CreateTableRequest { + pub fn new( + table_path: &TablePath, + table_descriptor: &TableDescriptor, + ignore_if_exists: bool, + ) -> FlussResult { + Ok(CreateTableRequest { + inner_request: proto::CreateTableRequest { + table_path: to_table_path(table_path), + table_json: serde_json::to_vec(&table_descriptor.serialize_json()?).unwrap(), + ignore_if_exists, + }, + }) + } +} + +impl RequestBody for CreateTableRequest { + type ResponseBody = CreateTableResponse; + + const API_KEY: ApiKey = ApiKey::CreateTable; + + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(CreateTableRequest); +impl_read_version_type!(CreateTableResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/fetch.rs b/fluss-rust/crates/fluss/src/rpc/message/fetch.rs new file mode 100644 index 0000000000..6ebc5a2b33 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/fetch.rs @@ -0,0 +1,56 @@ +// Licensed to the Apache 
Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::proto::FetchLogResponse; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; +use crate::{impl_read_version_type, impl_write_version_type, proto}; +use prost::Message; + +use bytes::{Buf, BufMut}; + +#[allow(dead_code)] +const LOG_FETCH_MAX_BYTES: i32 = 16 * 1024 * 1024; +#[allow(dead_code)] +const LOG_FETCH_MIN_BYTES: i32 = 1; +#[allow(dead_code)] +const LOG_FETCH_WAIT_MAX_TIME: i32 = 500; + +pub struct FetchLogRequest { + pub inner_request: proto::FetchLogRequest, +} + +impl FetchLogRequest { + pub fn new(fetch_log_request: proto::FetchLogRequest) -> Self { + Self { + inner_request: fetch_log_request, + } + } +} + +impl RequestBody for FetchLogRequest { + type ResponseBody = FetchLogResponse; + + const API_KEY: ApiKey = ApiKey::FetchLog; + + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(FetchLogRequest); +impl_read_version_type!(FetchLogResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/get_table.rs b/fluss-rust/crates/fluss/src/rpc/message/get_table.rs new file mode 100644 index 
0000000000..4f4d6c7a41 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/get_table.rs @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::proto::{GetTableInfoRequest, GetTableInfoResponse, PbTablePath}; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; + +use crate::metadata::TablePath; +use crate::{impl_read_version_type, impl_write_version_type}; +use bytes::{Buf, BufMut}; +use prost::Message; + +#[derive(Debug)] +pub struct GetTableRequest { + pub inner_request: GetTableInfoRequest, +} + +impl GetTableRequest { + pub fn new(table_path: &TablePath) -> Self { + let inner_request = GetTableInfoRequest { + table_path: PbTablePath { + database_name: table_path.database().to_owned(), + table_name: table_path.table().to_owned(), + }, + }; + + Self { inner_request } + } +} + +impl RequestBody for GetTableRequest { + type ResponseBody = GetTableInfoResponse; + const API_KEY: ApiKey = ApiKey::GetTable; + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(GetTableRequest); 
+impl_read_version_type!(GetTableInfoResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/header.rs b/fluss-rust/crates/fluss/src/rpc/message/header.rs new file mode 100644 index 0000000000..fe60f8c997 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/header.rs @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, WriteVersionedType}; +use bytes::{Buf, BufMut}; + +#[allow(dead_code)] +const REQUEST_HEADER_LENGTH: i32 = 8; +const SUCCESS_RESPONSE: u8 = 0; +#[allow(dead_code)] +const ERROR_RESPONSE: u8 = 1; +#[allow(dead_code)] +const SERVER_FAILURE: u8 = 2; + +#[derive(Debug, PartialEq, Eq)] +pub struct RequestHeader { + /// The API key of this request. 
+ pub request_api_key: ApiKey, + + pub request_api_version: ApiVersion, + + pub request_id: i32, + + pub client_id: Option, +} + +impl WriteVersionedType for RequestHeader +where + W: BufMut, +{ + fn write_versioned(&self, writer: &mut W, _version: ApiVersion) -> Result<(), WriteError> { + writer.put_i16(self.request_api_key.into()); + writer.put_i16(self.request_api_version.0); + writer.put_i32(self.request_id); + Ok(()) + } +} + +#[derive(Debug, PartialEq, Eq)] +pub struct ResponseHeader { + pub request_id: i32, +} + +impl ReadVersionedType for ResponseHeader +where + R: Buf, +{ + fn read_versioned(reader: &mut R, _version: ApiVersion) -> Result { + let resp_type = reader.get_u8(); + if resp_type != SUCCESS_RESPONSE { + todo!("handle unsuccess response type"); + } + let request_id = reader.get_i32(); + Ok(ResponseHeader { request_id }) + } +} diff --git a/fluss-rust/crates/fluss/src/rpc/message/mod.rs b/fluss-rust/crates/fluss/src/rpc/message/mod.rs new file mode 100644 index 0000000000..742c39369d --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/mod.rs @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use bytes::{Buf, BufMut}; + +mod create_table; +mod fetch; +mod get_table; +mod header; +mod produce_log; +mod update_metadata; + +pub use create_table::*; +pub use fetch::*; +pub use get_table::*; +pub use header::*; +pub use produce_log::*; +pub use update_metadata::*; + +pub trait RequestBody { + type ResponseBody; + + const API_KEY: ApiKey; + + const REQUEST_VERSION: ApiVersion; +} + +impl RequestBody for &T { + type ResponseBody = T::ResponseBody; + + const API_KEY: ApiKey = T::API_KEY; + + const REQUEST_VERSION: ApiVersion = T::REQUEST_VERSION; +} + +pub trait WriteVersionedType: Sized +where + W: BufMut, +{ + fn write_versioned(&self, writer: &mut W, version: ApiVersion) -> Result<(), WriteError>; +} + +pub trait ReadVersionedType: Sized +where + R: Buf, +{ + fn read_versioned(reader: &mut R, version: ApiVersion) -> Result; +} + +#[macro_export] +macro_rules! impl_write_version_type { + ($type:ty) => { + impl WriteVersionedType for $type + where + W: BufMut, + { + fn write_versioned( + &self, + writer: &mut W, + _version: ApiVersion, + ) -> Result<(), WriteError> { + Ok(self.inner_request.encode(writer).unwrap()) + } + } + }; +} + +#[macro_export] +macro_rules! impl_read_version_type { + ($type:ty) => { + impl ReadVersionedType for $type + where + R: Buf, + { + fn read_versioned(reader: &mut R, _version: ApiVersion) -> Result { + Ok(<$type>::decode(reader).unwrap()) + } + } + }; +} diff --git a/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs b/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs new file mode 100644 index 0000000000..7da2b59a2d --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Result as FlussResult; +use crate::proto::{PbProduceLogReqForBucket, ProduceLogResponse}; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; +use crate::{impl_read_version_type, impl_write_version_type, proto}; +use std::sync::Arc; + +use crate::client::ReadyWriteBatch; +use bytes::{Buf, BufMut}; +use prost::Message; + +pub struct ProduceLogRequest { + pub inner_request: proto::ProduceLogRequest, +} + +impl ProduceLogRequest { + pub fn new( + table_id: i64, + ack: i16, + max_request_timeout_ms: i32, + ready_batches: Vec<&Arc>, + ) -> FlussResult { + let mut request = proto::ProduceLogRequest { + table_id, + acks: ack as i32, + timeout_ms: max_request_timeout_ms, + ..Default::default() + }; + for ready_batch in ready_batches { + request.buckets_req.push(PbProduceLogReqForBucket { + partition_id: ready_batch.table_bucket.partition_id(), + bucket_id: ready_batch.table_bucket.bucket_id(), + records: ready_batch.write_batch.build()?, + }) + } + + Ok(ProduceLogRequest { + inner_request: request, + }) + } +} + +impl RequestBody for ProduceLogRequest { + type ResponseBody = ProduceLogResponse; + + const 
API_KEY: ApiKey = ApiKey::ProduceLog; + + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(ProduceLogRequest); +impl_read_version_type!(ProduceLogResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/update_metadata.rs b/fluss-rust/crates/fluss/src/rpc/message/update_metadata.rs new file mode 100644 index 0000000000..0d8ad6464a --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/update_metadata.rs @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::proto::{MetadataResponse, PbTablePath}; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; + +use crate::metadata::TablePath; +use crate::{impl_read_version_type, impl_write_version_type, proto}; +use bytes::{Buf, BufMut}; +use prost::Message; + +pub struct UpdateMetadataRequest { + pub inner_request: proto::MetadataRequest, +} + +impl UpdateMetadataRequest { + pub fn new(table_paths: &[&TablePath]) -> Self { + UpdateMetadataRequest { + inner_request: proto::MetadataRequest { + table_path: table_paths + .iter() + .map(|path| PbTablePath { + database_name: path.database().to_string(), + table_name: path.table().to_string(), + }) + .collect(), + partitions_path: vec![], + partitions_id: vec![], + }, + } + } +} + +impl RequestBody for UpdateMetadataRequest { + type ResponseBody = MetadataResponse; + + const API_KEY: ApiKey = ApiKey::MetaData; + + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(UpdateMetadataRequest); +impl_read_version_type!(MetadataResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/mod.rs b/fluss-rust/crates/fluss/src/rpc/mod.rs new file mode 100644 index 0000000000..496c015073 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/mod.rs @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod api_key; +mod api_version; +mod error; +mod frame; +pub mod message; +pub use error::*; +mod server_connection; +pub use server_connection::*; +mod convert; +mod transport; + +pub use message::*; + +pub use convert::*; diff --git a/fluss-rust/crates/fluss/src/rpc/server_connection.rs b/fluss-rust/crates/fluss/src/rpc/server_connection.rs new file mode 100644 index 0000000000..a102aa3ba6 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/server_connection.rs @@ -0,0 +1,402 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::cluster::ServerNode; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::error::RpcError; +use crate::rpc::error::RpcError::ConnectionError; +use crate::rpc::frame::{AsyncMessageRead, AsyncMessageWrite}; +use crate::rpc::message::{ + ReadVersionedType, RequestBody, RequestHeader, ResponseHeader, WriteVersionedType, +}; +use crate::rpc::transport::Transport; +use futures::future::BoxFuture; +use parking_lot::{Mutex, RwLock}; +use std::collections::HashMap; +use std::io::Cursor; +use std::ops::DerefMut; +use std::sync::Arc; +use std::sync::atomic::{AtomicI32, Ordering}; +use std::task::Poll; +use std::time::Duration; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufStream, WriteHalf}; +use tokio::sync::Mutex as AsyncMutex; +use tokio::sync::oneshot::{Sender, channel}; +use tokio::task::JoinHandle; +use tracing::warn; + +pub type MessengerTransport = ServerConnectionInner>; + +pub type ServerConnection = Arc; + +#[derive(Debug, Default)] +pub struct RpcClient { + connections: RwLock>, + client_id: Arc, + timeout: Option, + max_message_size: usize, +} + +impl RpcClient { + pub fn new() -> Self { + RpcClient { + connections: Default::default(), + client_id: Arc::from(""), + timeout: None, + max_message_size: usize::MAX, + } + } + + pub async fn get_connection( + &self, + server_node: &ServerNode, + ) -> Result { + let server_id = server_node.uid(); + { + let connections = self.connections.read(); + if let Some(connection) = connections.get(server_id) { + return Ok(connection.clone()); + } + } + + let new_server = self.connect(server_node).await?; + self.connections + .write() + .insert(server_id.clone(), new_server.clone()); + + Ok(new_server) + } + + async fn connect(&self, server_node: &ServerNode) -> Result { + let url = server_node.url(); + let transport = Transport::connect(&url, self.timeout) + .await + .map_err(|error| ConnectionError(error.to_string()))?; + + let messenger = ServerConnectionInner::new( + BufStream::new(transport), + 
self.max_message_size, + self.client_id.clone(), + ); + Ok(ServerConnection::new(messenger)) + } +} + +#[derive(Debug)] +struct Response { + #[allow(dead_code)] + header: ResponseHeader, + data: Cursor>, +} + +#[derive(Debug)] +struct ActiveRequest { + channel: Sender>, +} + +#[derive(Debug)] +enum ConnectionState { + /// Currently active requests by request ID. + /// + /// An active request is one that got prepared or send but the response wasn't received yet. + RequestMap(HashMap), + + /// One or our streams died and we are unable to process any more requests. + Poison(Arc), +} + +impl ConnectionState { + fn poison(&mut self, err: RpcError) -> Arc { + match self { + Self::RequestMap(map) => { + let err = Arc::new(err); + + // inform all active requests + for (_request_id, active_request) in map.drain() { + // it's OK if the other side is gone + active_request + .channel + .send(Err(RpcError::Poisoned(Arc::clone(&err)))) + .ok(); + } + *self = Self::Poison(Arc::clone(&err)); + err + } + Self::Poison(e) => { + // already poisoned, used existing error + Arc::clone(e) + } + } + } +} + +#[derive(Debug)] +pub struct ServerConnectionInner { + /// The half of the stream that we use to send data TO the broker. + /// + /// This will be used by [`request`](Self::request) to queue up messages. 
+ stream_write: Arc>>, + + client_id: Arc, + + request_id: AtomicI32, + + state: Arc>, + + join_handle: JoinHandle<()>, +} + +impl ServerConnectionInner +where + RW: AsyncRead + AsyncWrite + Send + 'static, +{ + pub fn new(stream: RW, max_message_size: usize, client_id: Arc) -> Self { + let (stream_read, stream_write) = tokio::io::split(stream); + let state = Arc::new(Mutex::new(ConnectionState::RequestMap(HashMap::default()))); + let state_captured = Arc::clone(&state); + + let join_handle = tokio::spawn(async move { + let mut stream_read = stream_read; + loop { + match stream_read.read_message(max_message_size).await { + Ok(msg) => { + // message was read, so all subsequent errors should not poison the whole stream + let mut cursor = Cursor::new(msg); + let header = + match ResponseHeader::read_versioned(&mut cursor, ApiVersion(0)) { + Ok(header) => header, + Err(e) => { + warn!(%e, "Cannot read message header, ignoring message"); + continue; + } + }; + + let active_request = match state_captured.lock().deref_mut() { + ConnectionState::RequestMap(map) => { + match map.remove(&header.request_id) { + Some(active_request) => active_request, + _ => { + warn!( + request_id = header.request_id, + "Got response for unknown request", + ); + continue; + } + } + } + ConnectionState::Poison(_) => { + // stream is poisoned, no need to anything + return; + } + }; + + // we don't care if the other side is gone + active_request + .channel + .send(Ok(Response { + header, + data: cursor, + })) + .ok(); + } + Err(e) => { + state_captured.lock().poison(RpcError::ReadMessageError(e)); + return; + } + } + } + }); + + Self { + stream_write: Arc::new(AsyncMutex::new(stream_write)), + client_id, + request_id: AtomicI32::new(0), + state, + join_handle, + } + } + + pub async fn request(&self, msg: R) -> Result + where + R: RequestBody + Send + WriteVersionedType>, + R::ResponseBody: ReadVersionedType>>, + { + let request_id = self.request_id.fetch_add(1, Ordering::SeqCst); + let header = 
RequestHeader { + request_api_key: R::API_KEY, + request_api_version: ApiVersion(0), + request_id, + client_id: Some(String::from(self.client_id.as_ref())), + }; + + let header_version = ApiVersion(0); + + let body_api_version = ApiVersion(0); + + let mut buf = Vec::new(); + // write header + header.write_versioned(&mut buf, header_version)?; + // write message body + msg.write_versioned(&mut buf, body_api_version)?; + + let (tx, rx) = channel(); + + // to prevent stale data in inner state, ensure that we would remove the request again if we are cancelled while + // sending the request + let _cleanup_on_cancel = + CleanupRequestStateOnCancel::new(Arc::clone(&self.state), request_id); + + match self.state.lock().deref_mut() { + ConnectionState::RequestMap(map) => { + map.insert(request_id, ActiveRequest { channel: tx }); + } + ConnectionState::Poison(e) => return Err(RpcError::Poisoned(Arc::clone(e))), + } + + self.send_message(buf).await?; + _cleanup_on_cancel.message_sent(); + let mut response = rx.await.expect("Who closed this channel?!")?; + + let body = R::ResponseBody::read_versioned(&mut response.data, body_api_version)?; + + let read_bytes = response.data.position(); + let message_bytes = response.data.into_inner().len() as u64; + if read_bytes != message_bytes { + return Err(RpcError::TooMuchData { + message_size: message_bytes, + read: read_bytes, + api_key: R::API_KEY, + api_version: body_api_version, + }); + } + Ok(body) + } + + async fn send_message(&self, msg: Vec) -> Result<(), RpcError> { + match self.send_message_inner(msg).await { + Ok(()) => Ok(()), + Err(e) => { + // need to poison the stream because message framing might be out-of-sync + let mut state = self.state.lock(); + Err(RpcError::Poisoned(state.poison(e))) + } + } + } + + async fn send_message_inner(&self, msg: Vec) -> Result<(), RpcError> { + let mut stream_write = Arc::clone(&self.stream_write).lock_owned().await; + + // use a wrapper so that cancellation doesn't cancel the send 
operation and leave half-sent messages on the wire + let fut = CancellationSafeFuture::new(async move { + stream_write.write_message(&msg).await?; + stream_write.flush().await?; + Ok(()) + }); + + fut.await + } +} + +impl Drop for ServerConnectionInner { + fn drop(&mut self) { + // todo: should remove from server_connections map? + self.join_handle.abort(); + } +} + +struct CancellationSafeFuture +where + F: Future + Send + 'static, +{ + /// Mark if the inner future finished. If not, we must spawn a helper task on drop. + done: bool, + + /// Inner future. + /// + /// Wrapped in an `Option` so we can extract it during drop. Inside that option however we also need a pinned + /// box because once this wrapper is polled, it will be pinned in memory -- even during drop. Now the inner + /// future does not necessarily implement `Unpin`, so we need a heap allocation to pin it in memory even when we + /// move it out of this option. + inner: Option>, +} + +impl CancellationSafeFuture +where + F: Future + Send, +{ + fn new(fut: F) -> Self { + Self { + done: false, + inner: Some(Box::pin(fut)), + } + } +} + +impl Future for CancellationSafeFuture +where + F: Future + Send, +{ + type Output = F::Output; + + fn poll( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll { + match self.inner.as_mut().expect("no dropped").as_mut().poll(cx) { + Poll::Ready(res) => { + self.done = true; + Poll::Ready(res) + } + Poll::Pending => Poll::Pending, + } + } +} + +/// Helper that ensures that a request is removed when a request is cancelled before it was actually sent out. +struct CleanupRequestStateOnCancel { + state: Arc>, + request_id: i32, + message_sent: bool, +} + +impl CleanupRequestStateOnCancel { + /// Create new helper. + /// + /// You must call [`message_sent`](Self::message_sent) when the request was sent. + fn new(state: Arc>, request_id: i32) -> Self { + Self { + state, + request_id, + message_sent: false, + } + } + + /// Request was sent. 
Do NOT clean the state any longer. + fn message_sent(mut self) { + self.message_sent = true; + } +} + +impl Drop for CleanupRequestStateOnCancel { + fn drop(&mut self) { + if !self.message_sent { + if let ConnectionState::RequestMap(map) = self.state.lock().deref_mut() { + map.remove(&self.request_id); + } + } + } +} diff --git a/fluss-rust/crates/fluss/src/rpc/transport.rs b/fluss-rust/crates/fluss/src/rpc/transport.rs new file mode 100644 index 0000000000..a6f721f6aa --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/transport.rs @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::rpc::error::RpcError; +use std::ops::DerefMut; +use std::pin::Pin; +use std::task::{Context, Poll}; +use std::time::Duration; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tokio::net::TcpStream; + +#[derive(Debug)] +pub enum Transport { + Plain { inner: TcpStream }, +} + +impl AsyncRead for Transport { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + match self.deref_mut() { + Self::Plain { inner } => Pin::new(inner).poll_read(cx, buf), + } + } +} + +impl AsyncWrite for Transport { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + match self.deref_mut() { + Self::Plain { inner } => Pin::new(inner).poll_write(cx, buf), + } + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match self.deref_mut() { + Self::Plain { inner } => Pin::new(inner).poll_flush(cx), + } + } + + fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match self.deref_mut() { + Self::Plain { inner } => Pin::new(inner).poll_shutdown(cx), + } + } +} + +impl Transport { + pub async fn connect(server: &str, timeout: Option) -> Result { + let tcp_stream = Self::connect_timeout(server, timeout).await?; + Ok(Transport::Plain { inner: tcp_stream }) + } + + async fn connect_timeout(host: &str, timeout: Option) -> Result { + match timeout { + Some(timeout) => Ok(tokio::time::timeout(timeout, TcpStream::connect(host)) + .await + .map_err(|_| { + RpcError::ConnectionError(format!("Timeout connecting to host {host}")) + })??), + None => Ok(TcpStream::connect(host).await?), + } + } +} diff --git a/fluss-rust/crates/fluss/src/util/mod.rs b/fluss-rust/crates/fluss/src/util/mod.rs new file mode 100644 index 0000000000..c26b4ec43a --- /dev/null +++ b/fluss-rust/crates/fluss/src/util/mod.rs @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::metadata::TableBucket; +use linked_hash_map::LinkedHashMap; +use std::collections::{HashMap, HashSet}; +use std::hash::Hash; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; + +pub fn current_time_ms() -> i64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() as i64 +} + +pub struct FairBucketStatusMap { + map: LinkedHashMap>, + size: usize, +} + +#[allow(dead_code)] +impl FairBucketStatusMap { + pub fn new() -> Self { + Self { + map: LinkedHashMap::new(), + size: 0, + } + } + + /// Moves the bucket to the end of the iteration order + pub fn move_to_end(&mut self, table_bucket: TableBucket) + where + TableBucket: Eq + Hash, + { + if let Some(status) = self.map.remove(&table_bucket) { + self.map.insert(table_bucket, status); + } + } + + /// Updates the status and moves the bucket to the end + pub fn update_and_move_to_end(&mut self, table_bucket: TableBucket, status: S) + where + TableBucket: Eq + Hash, + { + self.map.remove(&table_bucket); + self.map.insert(table_bucket, Arc::new(status)); + self.update_size(); + } + + /// Updates the status without changing the order + pub fn update(&mut self, table_bucket: TableBucket, status: Arc) + where + TableBucket: Eq + Hash, + { + self.map.insert(table_bucket, status); + 
self.update_size(); + } + + /// Removes a bucket + pub fn remove(&mut self, table_bucket: &TableBucket) + where + TableBucket: Eq + Hash, + { + self.map.remove(table_bucket); + self.update_size(); + } + + /// Returns an immutable view of all buckets + pub fn bucket_set(&self) -> HashSet<&TableBucket> + where + TableBucket: Eq + Hash, + { + self.map.keys().collect() + } + + /// Clears all buckets + pub fn clear(&mut self) { + self.map.clear(); + self.update_size(); + } + + /// Checks if a bucket exists + pub fn contains(&self, table_bucket: &TableBucket) -> bool + where + TableBucket: Eq + Hash, + { + self.map.contains_key(table_bucket) + } + + /// Returns an immutable view of the bucket-status map + pub fn bucket_status_map(&self) -> &LinkedHashMap> { + &self.map + } + + /// Returns status values in current order + pub fn bucket_status_values(&self) -> Vec<&Arc> { + self.map.values().collect() + } + + /// Gets the status for a bucket + pub fn status_value(&self, table_bucket: &TableBucket) -> Option<&Arc> + where + TableBucket: Eq + Hash, + { + self.map.get(table_bucket) + } + + /// Applies a function to each bucket-status pair + pub fn for_each(&self, mut f: F) + where + F: FnMut(&TableBucket, &S), + { + for (bucket, status) in &self.map { + f(bucket, status); + } + } + + /// Gets the current bucket count (thread-safe) + pub fn size(&self) -> usize { + self.size + } + + pub fn set(&mut self, bucket_to_status: HashMap>) + where + TableBucket: Eq + Hash + Clone, + S: Clone, + { + self.map.clear(); + + // Group buckets by table ID + let mut table_to_buckets: LinkedHashMap> = LinkedHashMap::new(); + for bucket in bucket_to_status.keys() { + table_to_buckets + .entry(bucket.table_id()) + .or_default() + .push(bucket.clone()); + } + + // Insert buckets grouped by table + for (_, buckets) in table_to_buckets { + for bucket in buckets { + if let Some(status) = bucket_to_status.get(&bucket) { + self.map.insert(bucket, status.clone()); + } + } + } + + self.update_size(); + 
} + + fn update_size(&mut self) { + self.size = self.map.len() + } +} + +impl Default for FairBucketStatusMap { + fn default() -> Self { + Self::new() + } +} diff --git a/fluss-rust/crates/fluss/tests/integration/client/mod.rs b/fluss-rust/crates/fluss/tests/integration/client/mod.rs new file mode 100644 index 0000000000..567c358cea --- /dev/null +++ b/fluss-rust/crates/fluss/tests/integration/client/mod.rs @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[test] +fn test() { + println!("Running integration tests"); +} diff --git a/fluss-rust/crates/fluss/tests/test_fluss.rs b/fluss-rust/crates/fluss/tests/test_fluss.rs new file mode 100644 index 0000000000..7840638c74 --- /dev/null +++ b/fluss-rust/crates/fluss/tests/test_fluss.rs @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[cfg(feature = "integration_tests")] +extern crate fluss; + +#[cfg(feature = "integration_tests")] +mod integration { + + mod client; +} diff --git a/fluss-rust/rust-toolchain.toml b/fluss-rust/rust-toolchain.toml new file mode 100644 index 0000000000..56c3bf5df8 --- /dev/null +++ b/fluss-rust/rust-toolchain.toml @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[toolchain] +channel = "stable" +components = ["rustfmt", "clippy"] \ No newline at end of file diff --git a/fluss-rust/rustfmt.toml b/fluss-rust/rustfmt.toml new file mode 100644 index 0000000000..18d114826f --- /dev/null +++ b/fluss-rust/rustfmt.toml @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +edition = "2024" +reorder_imports = true \ No newline at end of file From 40cfb79e6457ea05040b834745869e1cf98638df Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Tue, 26 Aug 2025 21:13:43 +0800 Subject: [PATCH 005/287] [hotfix] Ignore some file license check (#4) --- fluss-rust/.licenserc.yaml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 fluss-rust/.licenserc.yaml diff --git a/fluss-rust/.licenserc.yaml b/fluss-rust/.licenserc.yaml new file mode 100644 index 0000000000..3813b484b3 --- /dev/null +++ b/fluss-rust/.licenserc.yaml @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +header: + license: + spdx-id: Apache-2.0 + copyright-owner: Apache Software Foundation + + paths-ignore: + - '.github/PULL_REQUEST_TEMPLATE.md' + - '.gitignore' + - 'LICENSE' + - 'NOTICE' + - 'DISCLAIMER' + comment: on-failure From 69b0ead1a31bb3f7f06fea895f48aed583e5b382 Mon Sep 17 00:00:00 2001 From: naivedogger <59598718+naivedogger@users.noreply.github.com> Date: Thu, 11 Sep 2025 16:25:12 +0800 Subject: [PATCH 006/287] [feat] Add more functions to the Rust client Admin (#5) --- .../{dependabot.yml => dependabot.yml} | 0 fluss-rust/crates/fluss/src/client/admin.rs | 133 +++++++++- fluss-rust/crates/fluss/src/client/mod.rs | 6 +- .../crates/fluss/src/client/table/mod.rs | 5 +- .../fluss/src/client/write/accumulator.rs | 6 +- .../crates/fluss/src/cluster/cluster.rs | 1 + .../crates/fluss/src/metadata/database.rs | 234 ++++++++++++++++++ .../crates/fluss/src/metadata/datatype.rs | 12 + fluss-rust/crates/fluss/src/metadata/mod.rs | 4 +- fluss-rust/crates/fluss/src/metadata/table.rs | 65 +++++ .../crates/fluss/src/proto/fluss_api.proto | 83 +++++++ fluss-rust/crates/fluss/src/record/arrow.rs | 7 +- fluss-rust/crates/fluss/src/row/datum.rs | 103 +++++--- fluss-rust/crates/fluss/src/row/mod.rs | 3 +- fluss-rust/crates/fluss/src/rpc/api_key.rs | 39 ++- .../fluss/src/rpc/message/create_database.rs | 67 +++++ .../fluss/src/rpc/message/database_exists.rs | 49 ++++ .../fluss/src/rpc/message/drop_database.rs | 51 ++++ .../fluss/src/rpc/message/drop_table.rs | 56 +++++ .../src/rpc/message/get_database_info.rs | 49 ++++ .../rpc/message/get_latest_lake_snapshot.rs | 55 ++++ .../fluss/src/rpc/message/list_databases.rs | 47 ++++ .../fluss/src/rpc/message/list_tables.rs | 53 ++++ .../crates/fluss/src/rpc/message/mod.rs | 18 ++ .../fluss/src/rpc/message/table_exists.rs | 55 ++++ .../crates/fluss/src/rpc/server_connection.rs | 1 - 26 files changed, 1144 insertions(+), 58 
deletions(-) rename fluss-rust/.github/{dependabot.yml => dependabot.yml} (100%) create mode 100644 fluss-rust/crates/fluss/src/metadata/database.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/create_database.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/database_exists.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/drop_database.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/drop_table.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/get_database_info.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/get_latest_lake_snapshot.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/list_databases.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/list_tables.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/table_exists.rs diff --git a/fluss-rust/.github/dependabot.yml b/fluss-rust/.github/dependabot.yml similarity index 100% rename from fluss-rust/.github/dependabot.yml rename to fluss-rust/.github/dependabot.yml diff --git a/fluss-rust/crates/fluss/src/client/admin.rs b/fluss-rust/crates/fluss/src/client/admin.rs index 8688a2d844..2584034a69 100644 --- a/fluss-rust/crates/fluss/src/client/admin.rs +++ b/fluss-rust/crates/fluss/src/client/admin.rs @@ -16,18 +16,28 @@ // under the License. 
use crate::client::metadata::Metadata; -use crate::metadata::{JsonSerde, TableDescriptor, TableInfo, TablePath}; -use crate::rpc::message::{CreateTableRequest, GetTableRequest}; +use crate::metadata::{ + DatabaseDescriptor, DatabaseInfo, JsonSerde, LakeSnapshot, TableBucket, TableDescriptor, + TableInfo, TablePath, +}; +use crate::rpc::message::{ + CreateDatabaseRequest, CreateTableRequest, DatabaseExistsRequest, DropDatabaseRequest, + DropTableRequest, GetDatabaseInfoRequest, GetLatestLakeSnapshotRequest, GetTableRequest, + ListDatabasesRequest, ListTablesRequest, TableExistsRequest, +}; use crate::rpc::{RpcClient, ServerConnection}; + +use std::collections::HashMap; use std::sync::Arc; use crate::error::Result; use crate::proto::GetTableInfoResponse; -#[allow(dead_code)] pub struct FlussAdmin { admin_gateway: ServerConnection, + #[allow(dead_code)] metadata: Arc, + #[allow(dead_code)] rpc_client: Arc, } @@ -49,6 +59,23 @@ impl FlussAdmin { }) } + pub async fn create_database( + &self, + database_name: &str, + ignore_if_exists: bool, + database_descriptor: Option<&DatabaseDescriptor>, + ) -> Result<()> { + let _response = self + .admin_gateway + .request(CreateDatabaseRequest::new( + database_name, + ignore_if_exists, + database_descriptor, + )?) 
+ .await?; + Ok(()) + } + pub async fn create_table( &self, table_path: &TablePath, @@ -66,6 +93,14 @@ impl FlussAdmin { Ok(()) } + pub async fn drop_table(&self, table_path: &TablePath, ignore_if_exists: bool) -> Result<()> { + let _response = self + .admin_gateway + .request(DropTableRequest::new(table_path, ignore_if_exists)) + .await?; + Ok(()) + } + pub async fn get_table(&self, table_path: &TablePath) -> Result { let response = self .admin_gateway @@ -90,4 +125,96 @@ impl FlussAdmin { modified_time, )) } + + /// List all tables in the given database + pub async fn list_tables(&self, database_name: &str) -> Result> { + let response = self + .admin_gateway + .request(ListTablesRequest::new(database_name)) + .await?; + Ok(response.table_name) + } + + /// Check if a table exists + pub async fn table_exists(&self, table_path: &TablePath) -> Result { + let response = self + .admin_gateway + .request(TableExistsRequest::new(table_path)) + .await?; + Ok(response.exists) + } + + /// Drop a database + pub async fn drop_database( + &self, + database_name: &str, + ignore_if_not_exists: bool, + cascade: bool, + ) -> Result<()> { + let _response = self + .admin_gateway + .request(DropDatabaseRequest::new( + database_name, + ignore_if_not_exists, + cascade, + )) + .await?; + Ok(()) + } + + /// List all databases + pub async fn list_databases(&self) -> Result> { + let response = self + .admin_gateway + .request(ListDatabasesRequest::new()) + .await?; + Ok(response.database_name) + } + + /// Check if a database exists + pub async fn database_exists(&self, database_name: &str) -> Result { + let response = self + .admin_gateway + .request(DatabaseExistsRequest::new(database_name)) + .await?; + Ok(response.exists) + } + + /// Get database information + pub async fn get_database_info(&self, database_name: &str) -> Result { + let request = GetDatabaseInfoRequest::new(database_name); + let response = self.admin_gateway.request(request).await?; + + // Convert proto response to 
DatabaseInfo + let database_descriptor = DatabaseDescriptor::from_json_bytes(&response.database_json)?; + + Ok(DatabaseInfo::new( + database_name.to_string(), + database_descriptor, + response.created_time, + response.modified_time, + )) + } + + /// Get the latest lake snapshot for a table + pub async fn get_latest_lake_snapshot(&self, table_path: &TablePath) -> Result { + let response = self + .admin_gateway + .request(GetLatestLakeSnapshotRequest::new(table_path)) + .await?; + + // Convert proto response to LakeSnapshot + let mut table_buckets_offset = HashMap::new(); + for bucket_snapshot in response.bucket_snapshots { + let table_bucket = TableBucket::new(response.table_id, bucket_snapshot.bucket_id); + if let Some(log_offset) = bucket_snapshot.log_offset { + table_buckets_offset.insert(table_bucket, log_offset); + } + } + + Ok(LakeSnapshot::new( + response.snapshot_id, + table_buckets_offset, + )) + } } diff --git a/fluss-rust/crates/fluss/src/client/mod.rs b/fluss-rust/crates/fluss/src/client/mod.rs index 5b6908eec1..a971439258 100644 --- a/fluss-rust/crates/fluss/src/client/mod.rs +++ b/fluss-rust/crates/fluss/src/client/mod.rs @@ -17,10 +17,12 @@ mod admin; mod connection; +mod metadata; mod table; mod write; +pub use admin::*; pub use connection::*; -mod metadata; - +pub use metadata::*; +pub use table::*; pub use write::*; diff --git a/fluss-rust/crates/fluss/src/client/table/mod.rs b/fluss-rust/crates/fluss/src/client/table/mod.rs index 503a1edb39..4d6f8f045b 100644 --- a/fluss-rust/crates/fluss/src/client/table/mod.rs +++ b/fluss-rust/crates/fluss/src/client/table/mod.rs @@ -17,8 +17,6 @@ use crate::client::connection::FlussConnection; use crate::client::metadata::Metadata; -use crate::client::table::append::TableAppend; -use crate::client::table::scanner::TableScan; use crate::metadata::{TableInfo, TablePath}; use std::sync::Arc; @@ -29,6 +27,9 @@ mod append; mod scanner; mod writer; +pub use append::TableAppend; +pub use scanner::TableScan; + 
#[allow(dead_code)] pub struct FlussTable<'a> { conn: &'a FlussConnection, diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs index 0b77894025..32622c7b2d 100644 --- a/fluss-rust/crates/fluss/src/client/write/accumulator.rs +++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs @@ -93,13 +93,15 @@ impl RecordAccumulator { } let table_path = &record.table_path; - + let table_info = cluster.get_table(table_path); let row_type = &cluster.get_table(table_path).row_type; + let schema_id = table_info.schema_id; + let mut batch = ArrowLog(ArrowLogWriteBatch::new( self.batch_id.fetch_add(1, Ordering::Relaxed), table_path.as_ref().clone(), - 0, + schema_id, row_type, bucket_id, current_time_ms(), diff --git a/fluss-rust/crates/fluss/src/cluster/cluster.rs b/fluss-rust/crates/fluss/src/cluster/cluster.rs index 1f8341dd66..a6f20a8262 100644 --- a/fluss-rust/crates/fluss/src/cluster/cluster.rs +++ b/fluss-rust/crates/fluss/src/cluster/cluster.rs @@ -126,6 +126,7 @@ impl Cluster { table_metadata.modified_time, ); table_info_by_path.insert(table_path.clone(), table_info); + table_id_by_path.insert(table_path.clone(), table_id); // now, get bucket matadata let mut found_unavailable_bucket = false; diff --git a/fluss-rust/crates/fluss/src/metadata/database.rs b/fluss-rust/crates/fluss/src/metadata/database.rs new file mode 100644 index 0000000000..2649421d6f --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/database.rs @@ -0,0 +1,234 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Error::JsonSerdeError; +use crate::error::Result; +use crate::metadata::JsonSerde; +use serde::{Deserialize, Serialize}; +use serde_json::{Value, json}; +use std::collections::HashMap; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct DatabaseDescriptor { + comment: Option, + custom_properties: HashMap, +} + +#[derive(Debug, Clone)] +pub struct DatabaseInfo { + database_name: String, + database_descriptor: DatabaseDescriptor, + created_time: i64, + modified_time: i64, +} + +impl DatabaseInfo { + pub fn new( + database_name: String, + database_descriptor: DatabaseDescriptor, + created_time: i64, + modified_time: i64, + ) -> Self { + Self { + database_name, + database_descriptor, + created_time, + modified_time, + } + } + + pub fn database_name(&self) -> &str { + &self.database_name + } + + pub fn database_descriptor(&self) -> &DatabaseDescriptor { + &self.database_descriptor + } + + pub fn created_time(&self) -> i64 { + self.created_time + } + + pub fn modified_time(&self) -> i64 { + self.modified_time + } +} + +#[derive(Debug, Default)] +pub struct DatabaseDescriptorBuilder { + comment: Option, + custom_properties: HashMap, +} + +impl DatabaseDescriptor { + pub fn builder() -> DatabaseDescriptorBuilder { + DatabaseDescriptorBuilder::default() + } + + pub fn comment(&self) -> Option<&str> { + self.comment.as_deref() + } + + pub fn custom_properties(&self) -> &HashMap { + &self.custom_properties + } +} + +impl DatabaseDescriptorBuilder { + pub fn comment(mut self, comment: &str) -> Self { + self.comment = 
Some(comment.to_string()); + self + } + + pub fn custom_properties(mut self, properties: HashMap) -> Self { + self.custom_properties = properties; + self + } + + pub fn custom_property(mut self, key: &str, value: &str) -> Self { + self.custom_properties + .insert(key.to_string(), value.to_string()); + self + } + + pub fn build(self) -> Result { + Ok(DatabaseDescriptor { + comment: self.comment, + custom_properties: self.custom_properties, + }) + } +} + +impl DatabaseDescriptor { + const CUSTOM_PROPERTIES_NAME: &'static str = "custom_properties"; + const COMMENT_NAME: &'static str = "comment"; + const VERSION_KEY: &'static str = "version"; + const VERSION: u32 = 1; +} + +impl JsonSerde for DatabaseDescriptor { + fn serialize_json(&self) -> Result { + let mut obj = serde_json::Map::new(); + + // Serialize version + obj.insert(Self::VERSION_KEY.to_string(), json!(Self::VERSION)); + + // Serialize comment if present + if let Some(comment) = self.comment() { + obj.insert(Self::COMMENT_NAME.to_string(), json!(comment)); + } + + // Serialize custom properties + obj.insert( + Self::CUSTOM_PROPERTIES_NAME.to_string(), + json!(self.custom_properties()), + ); + + Ok(Value::Object(obj)) + } + + fn deserialize_json(node: &Value) -> Result { + let mut builder = DatabaseDescriptor::builder(); + + // Deserialize comment if present + if let Some(comment_node) = node.get(Self::COMMENT_NAME) { + let comment = comment_node + .as_str() + .ok_or_else(|| { + JsonSerdeError(format!("{} should be a string", Self::COMMENT_NAME)) + })? 
+ .to_owned(); + builder = builder.comment(&comment); + } + + // Deserialize custom properties directly + let custom_properties = if let Some(props_node) = node.get(Self::CUSTOM_PROPERTIES_NAME) { + let obj = props_node.as_object().ok_or_else(|| { + JsonSerdeError("Custom properties should be an object".to_string()) + })?; + + let mut properties = HashMap::with_capacity(obj.len()); + for (key, value) in obj { + properties.insert( + key.clone(), + value + .as_str() + .ok_or_else(|| { + JsonSerdeError("Property value should be a string".to_string()) + })? + .to_owned(), + ); + } + properties + } else { + HashMap::new() + }; + builder = builder.custom_properties(custom_properties); + + builder.build() + } +} + +impl DatabaseDescriptor { + /// Create DatabaseDescriptor from JSON bytes (equivalent to Java's fromJsonBytes) + pub fn from_json_bytes(bytes: &[u8]) -> Result { + let json_value: Value = serde_json::from_slice(bytes) + .map_err(|e| JsonSerdeError(format!("Failed to parse JSON: {}", e)))?; + Self::deserialize_json(&json_value) + } + + /// Convert DatabaseDescriptor to JSON bytes + pub fn to_json_bytes(&self) -> Result> { + let json_value = self.serialize_json()?; + serde_json::to_vec(&json_value) + .map_err(|e| JsonSerdeError(format!("Failed to serialize to JSON: {}", e))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_database_descriptor_json_serde() { + let mut custom_props = HashMap::new(); + custom_props.insert("key1".to_string(), "value1".to_string()); + custom_props.insert("key2".to_string(), "value2".to_string()); + + let descriptor = DatabaseDescriptor::builder() + .comment("Test database") + .custom_properties(custom_props) + .build() + .unwrap(); + + // Test serialization + let json_bytes = descriptor.to_json_bytes().unwrap(); + println!("Serialized JSON: {}", String::from_utf8_lossy(&json_bytes)); + + // Test deserialization + let deserialized = DatabaseDescriptor::from_json_bytes(&json_bytes).unwrap(); + 
assert_eq!(descriptor, deserialized); + } + + #[test] + fn test_empty_database_descriptor() { + let descriptor = DatabaseDescriptor::builder().build().unwrap(); + let json_bytes = descriptor.to_json_bytes().unwrap(); + let deserialized = DatabaseDescriptor::from_json_bytes(&json_bytes).unwrap(); + assert_eq!(descriptor, deserialized); + } +} diff --git a/fluss-rust/crates/fluss/src/metadata/datatype.rs b/fluss-rust/crates/fluss/src/metadata/datatype.rs index 0c00c6f08e..09ca0c2c57 100644 --- a/fluss-rust/crates/fluss/src/metadata/datatype.rs +++ b/fluss-rust/crates/fluss/src/metadata/datatype.rs @@ -591,6 +591,10 @@ impl ArrayType { element_type: self.element_type.clone(), } } + + pub fn get_element_type(&self) -> &DataType { + &self.element_type + } } #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] @@ -620,6 +624,14 @@ impl MapType { value_type: self.value_type.clone(), } } + + pub fn key_type(&self) -> &DataType { + &self.key_type + } + + pub fn value_type(&self) -> &DataType { + &self.value_type + } } #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] diff --git a/fluss-rust/crates/fluss/src/metadata/mod.rs b/fluss-rust/crates/fluss/src/metadata/mod.rs index 79465474f7..87540071f3 100644 --- a/fluss-rust/crates/fluss/src/metadata/mod.rs +++ b/fluss-rust/crates/fluss/src/metadata/mod.rs @@ -15,10 +15,12 @@ // specific language governing permissions and limitations // under the License. 
+mod database; mod datatype; -pub use datatype::*; mod json_serde; mod table; +pub use database::*; +pub use datatype::*; pub use json_serde::*; pub use table::*; diff --git a/fluss-rust/crates/fluss/src/metadata/table.rs b/fluss-rust/crates/fluss/src/metadata/table.rs index a5ab61d67e..90e3573eb8 100644 --- a/fluss-rust/crates/fluss/src/metadata/table.rs +++ b/fluss-rust/crates/fluss/src/metadata/table.rs @@ -584,6 +584,16 @@ impl Display for LogFormat { } } +impl LogFormat { + pub fn parse(s: &str) -> Result { + match s.to_uppercase().as_str() { + "ARROW" => Ok(LogFormat::ARROW), + "INDEXED" => Ok(LogFormat::INDEXED), + _ => Err(InvalidTableError(format!("Unknown log format: {}", s))), + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum KvFormat { INDEXED, @@ -600,6 +610,16 @@ impl Display for KvFormat { } } +impl KvFormat { + pub fn parse(s: &str) -> Result { + match s.to_uppercase().as_str() { + "INDEXED" => Ok(KvFormat::INDEXED), + "COMPACTED" => Ok(KvFormat::COMPACTED), + _ => Err(InvalidTableError(format!("Unknown kv format: {}", s))), + } + } +} + #[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize)] pub struct TablePath { database: String, @@ -631,6 +651,28 @@ impl TablePath { } } +#[derive(Debug, Clone)] +pub struct PhysicalTablePath { + table_path: TablePath, + #[allow(dead_code)] + partition: Option, +} + +impl PhysicalTablePath { + pub fn of(table_path: TablePath) -> Self { + Self { + table_path, + partition: None, + } + } + + // TODO: support partition + + pub fn get_table_path(&self) -> &TablePath { + &self.table_path + } +} + #[derive(Debug, Clone)] pub struct TableInfo { pub table_path: TablePath, @@ -918,3 +960,26 @@ impl TableBucket { self.partition_id } } + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LakeSnapshot { + pub snapshot_id: i64, + pub table_buckets_offset: HashMap, +} + +impl LakeSnapshot { + pub fn new(snapshot_id: i64, table_buckets_offset: HashMap) -> Self { + Self { + 
snapshot_id, + table_buckets_offset, + } + } + + pub fn snapshot_id(&self) -> i64 { + self.snapshot_id + } + + pub fn table_buckets_offset(&self) -> &HashMap { + &self.table_buckets_offset + } +} diff --git a/fluss-rust/crates/fluss/src/proto/fluss_api.proto b/fluss-rust/crates/fluss/src/proto/fluss_api.proto index 195b8f824c..d71197b2b0 100644 --- a/fluss-rust/crates/fluss/src/proto/fluss_api.proto +++ b/fluss-rust/crates/fluss/src/proto/fluss_api.proto @@ -123,6 +123,21 @@ message CreateTableRequest { message CreateTableResponse { } +message DropTableRequest { + required PbTablePath table_path = 1; + required bool ignore_if_not_exists = 2; +} + +message DropTableResponse { +} + +message TableExistsRequest { + required PbTablePath table_path = 1; +} + +message TableExistsResponse { + required bool exists = 1; +} message GetTableInfoRequest { required PbTablePath table_path = 1; @@ -136,6 +151,57 @@ message GetTableInfoResponse { required int64 modified_time = 5; } +message ListTablesRequest { + required string database_name = 1; +} + +message ListTablesResponse { + repeated string table_name = 1; +} + +message CreateDatabaseRequest { + required string database_name = 1; + required bool ignore_if_exists = 2; + optional bytes database_json = 3; +} + +message CreateDatabaseResponse { +} + +message GetDatabaseInfoRequest { + required string database_name = 1; +} + +message GetDatabaseInfoResponse { + required bytes database_json = 3; + required int64 created_time = 4; + required int64 modified_time = 5; +} + +message DropDatabaseRequest { + required string database_name = 1; + required bool ignore_if_not_exists = 2; + required bool cascade = 3; +} + +message DropDatabaseResponse { +} + +message DatabaseExistsRequest { + required string database_name = 1; +} + +message DatabaseExistsResponse { + required bool exists = 1; +} + +message ListDatabasesRequest { +} + +message ListDatabasesResponse { + repeated string database_name = 1; +} + // fetch log request and response 
message FetchLogRequest { @@ -194,4 +260,21 @@ message PbRemoteLogSegment { required int64 remote_log_start_offset = 2; required int64 remote_log_end_offset = 3; required int32 segment_size_in_bytes = 4; +} + +// fetch latest lake snapshot +message GetLatestLakeSnapshotRequest { + required PbTablePath table_path = 1; +} + +message GetLatestLakeSnapshotResponse { + required int64 table_id = 1; + required int64 snapshotId = 2; + repeated PbLakeSnapshotForBucket bucket_snapshots = 3; +} + +message PbLakeSnapshotForBucket { + optional int64 partition_id = 1; + required int32 bucket_id = 2; + optional int64 log_offset = 3; } \ No newline at end of file diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index 2f595d0304..fa63b00603 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -16,9 +16,9 @@ // under the License. use arrow::array::{ - ArrayBuilder, ArrayRef, BooleanBuilder, Float32Builder, Float64Builder, Int8Builder, - Int16Builder, Int32Builder, Int64Builder, StringBuilder, UInt8Builder, UInt16Builder, - UInt32Builder, UInt64Builder, + ArrayBuilder, ArrayRef, BinaryBuilder, BooleanBuilder, Float32Builder, Float64Builder, + Int8Builder, Int16Builder, Int32Builder, Int64Builder, StringBuilder, UInt8Builder, + UInt16Builder, UInt32Builder, UInt64Builder, }; use arrow::{ array::RecordBatch, @@ -224,6 +224,7 @@ impl MemoryLogRecordsArrowBuilder { arrow_schema::DataType::Float64 => Box::new(Float64Builder::new()), arrow_schema::DataType::Boolean => Box::new(BooleanBuilder::new()), arrow_schema::DataType::Utf8 => Box::new(StringBuilder::new()), + arrow_schema::DataType::Binary => Box::new(BinaryBuilder::new()), dt => panic!("Unsupported data type: {dt:?}"), } } diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs index 3c65a7d4ee..d8c4f748ca 100644 --- a/fluss-rust/crates/fluss/src/row/datum.rs +++ 
b/fluss-rust/crates/fluss/src/row/datum.rs @@ -19,7 +19,10 @@ use chrono::Datelike; use crate::error::Error::RowConvertError; use crate::error::Result; -use arrow::array::{ArrayBuilder, Int8Builder, Int16Builder, Int32Builder, StringBuilder}; +use arrow::array::{ + ArrayBuilder, BinaryBuilder, BooleanBuilder, Float32Builder, Float64Builder, Int8Builder, + Int16Builder, Int32Builder, Int64Builder, StringBuilder, +}; use chrono::NaiveDate; use ordered_float::OrderedFloat; use parse_display::Display; @@ -47,6 +50,8 @@ pub enum Datum<'a> { #[display("{0}")] Int64(i64), #[display("{0}")] + Float32(F32), + #[display("{0}")] Float64(F64), #[display("'{0}'")] String(&'a str), @@ -96,6 +101,20 @@ impl From> for Datum<'_> { } } +impl<'a> From for Datum<'a> { + #[inline] + fn from(f: f32) -> Datum<'a> { + Datum::Float32(F32::from(f)) + } +} + +impl<'a> From for Datum<'a> { + #[inline] + fn from(f: f64) -> Datum<'a> { + Datum::Float64(F64::from(f)) + } +} + impl TryFrom<&Datum<'_>> for i32 { type Error = (); @@ -126,45 +145,56 @@ pub trait ToArrow { impl Datum<'_> { pub fn append_to(&self, builder: &mut dyn ArrayBuilder) -> Result<()> { + macro_rules! append_null_to_arrow { + ($builder_type:ty) => { + if let Some(b) = builder.as_any_mut().downcast_mut::<$builder_type>() { + b.append_null(); + return Ok(()); + } + }; + } + + macro_rules! 
append_value_to_arrow { + ($builder_type:ty, $value:expr) => { + if let Some(b) = builder.as_any_mut().downcast_mut::<$builder_type>() { + b.append_value($value); + return Ok(()); + } + }; + } + match self { Datum::Null => { - todo!() - } - Datum::Bool(_v) => { - todo!() - } - Datum::Int16(_v) => { - todo!() - } - Datum::Int32(v) => { - v.append_to(builder)?; - } - Datum::Int64(_v) => { - todo!() - } - Datum::Float64(_v) => { - todo!() - } - Datum::String(v) => { - v.append_to(builder)?; + append_null_to_arrow!(BooleanBuilder); + append_null_to_arrow!(Int16Builder); + append_null_to_arrow!(Int32Builder); + append_null_to_arrow!(Int64Builder); + append_null_to_arrow!(Float32Builder); + append_null_to_arrow!(Float64Builder); + append_null_to_arrow!(StringBuilder); + append_null_to_arrow!(BinaryBuilder); } - Datum::Blob(_v) => { - todo!() - } - Datum::Decimal(_v) => { - todo!() - } - Datum::Date(_v) => { - todo!() - } - Datum::Timestamp(_v) => { - todo!() - } - Datum::TimestampTz(_v) => { - todo!() + Datum::Bool(v) => append_value_to_arrow!(BooleanBuilder, *v), + Datum::Int16(v) => append_value_to_arrow!(Int16Builder, *v), + Datum::Int32(v) => append_value_to_arrow!(Int32Builder, *v), + Datum::Int64(v) => append_value_to_arrow!(Int64Builder, *v), + Datum::Float32(v) => append_value_to_arrow!(Float32Builder, v.into_inner()), + Datum::Float64(v) => append_value_to_arrow!(Float64Builder, v.into_inner()), + Datum::String(v) => append_value_to_arrow!(StringBuilder, *v), + Datum::Blob(v) => append_value_to_arrow!(BinaryBuilder, v.as_ref()), + Datum::Decimal(_) | Datum::Date(_) | Datum::Timestamp(_) | Datum::TimestampTz(_) => { + return Err(RowConvertError(format!( + "Type {:?} is not yet supported for Arrow conversion", + std::mem::discriminant(self) + ))); } } - Ok(()) + + Err(RowConvertError(format!( + "Cannot append {:?} to builder of type {}", + self, + std::any::type_name_of_val(builder) + ))) } } @@ -190,9 +220,10 @@ macro_rules! 
impl_to_arrow { impl_to_arrow!(i8, Int8Builder); impl_to_arrow!(i16, Int16Builder); impl_to_arrow!(i32, Int32Builder); +impl_to_arrow!(f32, Float32Builder); +impl_to_arrow!(f64, Float64Builder); impl_to_arrow!(&str, StringBuilder); -#[allow(dead_code)] pub type F32 = OrderedFloat; pub type F64 = OrderedFloat; #[allow(dead_code)] diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index ead6ff0067..b900cb51d5 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -15,13 +15,12 @@ // specific language governing permissions and limitations // under the License. -use crate::row::datum::Datum; - mod column; mod datum; pub use column::*; +pub use datum::*; pub trait InternalRow { /// Returns the number of fields in this row diff --git a/fluss-rust/crates/fluss/src/rpc/api_key.rs b/fluss-rust/crates/fluss/src/rpc/api_key.rs index 49282084ef..18ce44fbef 100644 --- a/fluss-rust/crates/fluss/src/rpc/api_key.rs +++ b/fluss-rust/crates/fluss/src/rpc/api_key.rs @@ -19,22 +19,40 @@ use crate::rpc::api_key::ApiKey::Unknown; #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)] pub enum ApiKey { + CreateDatabase, + DropDatabase, + ListDatabases, + DatabaseExists, CreateTable, + DropTable, + GetTable, + ListTables, + TableExists, + MetaData, ProduceLog, FetchLog, - MetaData, - GetTable, + GetDatabaseInfo, + GetLatestLakeSnapshot, Unknown(i16), } impl From for ApiKey { fn from(key: i16) -> Self { match key { + 1001 => ApiKey::CreateDatabase, + 1002 => ApiKey::DropDatabase, + 1003 => ApiKey::ListDatabases, + 1004 => ApiKey::DatabaseExists, 1005 => ApiKey::CreateTable, + 1006 => ApiKey::DropTable, + 1007 => ApiKey::GetTable, + 1008 => ApiKey::ListTables, + 1010 => ApiKey::TableExists, + 1012 => ApiKey::MetaData, 1014 => ApiKey::ProduceLog, 1015 => ApiKey::FetchLog, - 1012 => ApiKey::MetaData, - 1007 => ApiKey::GetTable, + 1032 => ApiKey::GetLatestLakeSnapshot, + 1035 => 
ApiKey::GetDatabaseInfo, _ => Unknown(key), } } @@ -43,11 +61,20 @@ impl From for ApiKey { impl From for i16 { fn from(key: ApiKey) -> Self { match key { + ApiKey::CreateDatabase => 1001, + ApiKey::DropDatabase => 1002, + ApiKey::ListDatabases => 1003, + ApiKey::DatabaseExists => 1004, ApiKey::CreateTable => 1005, - ApiKey::ProduceLog => 1014, - ApiKey::MetaData => 1012, + ApiKey::DropTable => 1006, ApiKey::GetTable => 1007, + ApiKey::ListTables => 1008, + ApiKey::TableExists => 1010, + ApiKey::MetaData => 1012, + ApiKey::ProduceLog => 1014, ApiKey::FetchLog => 1015, + ApiKey::GetLatestLakeSnapshot => 1032, + ApiKey::GetDatabaseInfo => 1035, Unknown(x) => x, } } diff --git a/fluss-rust/crates/fluss/src/rpc/message/create_database.rs b/fluss-rust/crates/fluss/src/rpc/message/create_database.rs new file mode 100644 index 0000000000..e4052ef361 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/create_database.rs @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::metadata::DatabaseDescriptor; +use crate::{impl_read_version_type, impl_write_version_type, proto}; + +use crate::error::Result as FlussResult; +use crate::proto::CreateDatabaseResponse; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; + +use bytes::{Buf, BufMut}; +use prost::Message; + +#[derive(Debug)] +pub struct CreateDatabaseRequest { + pub inner_request: proto::CreateDatabaseRequest, +} + +impl CreateDatabaseRequest { + pub fn new( + database_name: &str, + ignore_if_exists: bool, + database_descriptor: Option<&DatabaseDescriptor>, + ) -> FlussResult { + let database_json = if let Some(descriptor) = database_descriptor { + Some(descriptor.to_json_bytes()?) + } else { + None + }; + + Ok(CreateDatabaseRequest { + inner_request: proto::CreateDatabaseRequest { + database_name: database_name.to_string(), + ignore_if_exists, + database_json, + }, + }) + } +} + +impl RequestBody for CreateDatabaseRequest { + type ResponseBody = CreateDatabaseResponse; + + const API_KEY: ApiKey = ApiKey::CreateDatabase; + + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(CreateDatabaseRequest); +impl_read_version_type!(CreateDatabaseResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/database_exists.rs b/fluss-rust/crates/fluss/src/rpc/message/database_exists.rs new file mode 100644 index 0000000000..795eea1260 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/database_exists.rs @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; +use crate::{impl_read_version_type, impl_write_version_type, proto}; +use bytes::{Buf, BufMut}; +use prost::Message; + +#[derive(Debug)] +pub struct DatabaseExistsRequest { + pub inner_request: proto::DatabaseExistsRequest, +} + +impl DatabaseExistsRequest { + pub fn new(database_name: &str) -> Self { + DatabaseExistsRequest { + inner_request: proto::DatabaseExistsRequest { + database_name: database_name.to_string(), + }, + } + } +} + +impl RequestBody for DatabaseExistsRequest { + type ResponseBody = proto::DatabaseExistsResponse; + + const API_KEY: ApiKey = ApiKey::DatabaseExists; + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(DatabaseExistsRequest); +impl_read_version_type!(proto::DatabaseExistsResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/drop_database.rs b/fluss-rust/crates/fluss/src/rpc/message/drop_database.rs new file mode 100644 index 0000000000..49cbfaf8d4 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/drop_database.rs @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; +use crate::{impl_read_version_type, impl_write_version_type, proto}; +use bytes::{Buf, BufMut}; +use prost::Message; + +#[derive(Debug)] +pub struct DropDatabaseRequest { + pub inner_request: proto::DropDatabaseRequest, +} + +impl DropDatabaseRequest { + pub fn new(database_name: &str, ignore_if_not_exists: bool, cascade: bool) -> Self { + DropDatabaseRequest { + inner_request: proto::DropDatabaseRequest { + database_name: database_name.to_string(), + ignore_if_not_exists, + cascade, + }, + } + } +} + +impl RequestBody for DropDatabaseRequest { + type ResponseBody = proto::DropDatabaseResponse; + + const API_KEY: ApiKey = ApiKey::DropDatabase; + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(DropDatabaseRequest); +impl_read_version_type!(proto::DropDatabaseResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/drop_table.rs b/fluss-rust/crates/fluss/src/rpc/message/drop_table.rs new file mode 100644 index 0000000000..0dbc21bbbe --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/drop_table.rs @@ -0,0 +1,56 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::metadata::TablePath; +use crate::{impl_read_version_type, impl_write_version_type, proto}; + +use crate::proto::DropTableResponse; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::convert::to_table_path; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; + +use bytes::{Buf, BufMut}; +use prost::Message; + +#[derive(Debug)] +pub struct DropTableRequest { + pub inner_request: proto::DropTableRequest, +} + +impl DropTableRequest { + pub fn new(table_path: &TablePath, ignore_if_not_exists: bool) -> Self { + DropTableRequest { + inner_request: proto::DropTableRequest { + table_path: to_table_path(table_path), + ignore_if_not_exists, + }, + } + } +} + +impl RequestBody for DropTableRequest { + type ResponseBody = DropTableResponse; + + const API_KEY: ApiKey = ApiKey::DropTable; + + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(DropTableRequest); +impl_read_version_type!(DropTableResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/get_database_info.rs 
b/fluss-rust/crates/fluss/src/rpc/message/get_database_info.rs new file mode 100644 index 0000000000..85492a8bf0 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/get_database_info.rs @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; +use crate::{impl_read_version_type, impl_write_version_type, proto}; +use bytes::{Buf, BufMut}; +use prost::Message; + +#[derive(Debug)] +pub struct GetDatabaseInfoRequest { + pub inner_request: proto::GetDatabaseInfoRequest, +} + +impl GetDatabaseInfoRequest { + pub fn new(database_name: &str) -> Self { + GetDatabaseInfoRequest { + inner_request: proto::GetDatabaseInfoRequest { + database_name: database_name.to_string(), + }, + } + } +} + +impl RequestBody for GetDatabaseInfoRequest { + type ResponseBody = proto::GetDatabaseInfoResponse; + + const API_KEY: ApiKey = ApiKey::GetDatabaseInfo; + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(GetDatabaseInfoRequest); +impl_read_version_type!(proto::GetDatabaseInfoResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/get_latest_lake_snapshot.rs b/fluss-rust/crates/fluss/src/rpc/message/get_latest_lake_snapshot.rs new file mode 100644 index 0000000000..a0e186efd2 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/get_latest_lake_snapshot.rs @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::proto; +use crate::proto::PbTablePath; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; + +use crate::metadata::TablePath; +use crate::{impl_read_version_type, impl_write_version_type}; +use bytes::{Buf, BufMut}; +use prost::Message; + +#[derive(Debug)] +pub struct GetLatestLakeSnapshotRequest { + pub inner_request: proto::GetLatestLakeSnapshotRequest, +} + +impl GetLatestLakeSnapshotRequest { + pub fn new(table_path: &TablePath) -> Self { + let inner_request = proto::GetLatestLakeSnapshotRequest { + table_path: PbTablePath { + database_name: table_path.database().to_string(), + table_name: table_path.table().to_string(), + }, + }; + + Self { inner_request } + } +} + +impl RequestBody for GetLatestLakeSnapshotRequest { + type ResponseBody = proto::GetLatestLakeSnapshotResponse; + const API_KEY: ApiKey = ApiKey::GetLatestLakeSnapshot; + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(GetLatestLakeSnapshotRequest); +impl_read_version_type!(proto::GetLatestLakeSnapshotResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/list_databases.rs b/fluss-rust/crates/fluss/src/rpc/message/list_databases.rs new file mode 100644 index 0000000000..ce5a091540 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/list_databases.rs @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; +use crate::{impl_read_version_type, impl_write_version_type, proto}; +use bytes::{Buf, BufMut}; +use prost::Message; + +#[derive(Debug, Default)] +pub struct ListDatabasesRequest { + pub inner_request: proto::ListDatabasesRequest, +} + +impl ListDatabasesRequest { + pub fn new() -> Self { + ListDatabasesRequest { + inner_request: proto::ListDatabasesRequest {}, + } + } +} + +impl RequestBody for ListDatabasesRequest { + type ResponseBody = proto::ListDatabasesResponse; + + const API_KEY: ApiKey = ApiKey::ListDatabases; + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(ListDatabasesRequest); +impl_read_version_type!(proto::ListDatabasesResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/list_tables.rs b/fluss-rust/crates/fluss/src/rpc/message/list_tables.rs new file mode 100644 index 0000000000..daf57ea6b5 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/list_tables.rs @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::{impl_read_version_type, impl_write_version_type, proto}; + +use crate::proto::ListTablesResponse; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; + +use bytes::{Buf, BufMut}; +use prost::Message; + +#[derive(Debug)] +pub struct ListTablesRequest { + pub inner_request: proto::ListTablesRequest, +} + +impl ListTablesRequest { + pub fn new(database_name: &str) -> Self { + ListTablesRequest { + inner_request: proto::ListTablesRequest { + database_name: database_name.to_string(), + }, + } + } +} + +impl RequestBody for ListTablesRequest { + type ResponseBody = ListTablesResponse; + + const API_KEY: ApiKey = ApiKey::ListTables; + + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(ListTablesRequest); +impl_read_version_type!(ListTablesResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/mod.rs b/fluss-rust/crates/fluss/src/rpc/message/mod.rs index 742c39369d..d5f8ebde89 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/mod.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/mod.rs @@ -20,18 +20,36 @@ use crate::rpc::api_version::ApiVersion; use crate::rpc::frame::{ReadError, WriteError}; use bytes::{Buf, BufMut}; +mod create_database; mod create_table; +mod database_exists; +mod drop_database; +mod drop_table; mod fetch; +mod get_database_info; +mod get_latest_lake_snapshot; mod get_table; mod header; +mod list_databases; +mod list_tables; mod 
produce_log; +mod table_exists; mod update_metadata; +pub use create_database::*; pub use create_table::*; +pub use database_exists::*; +pub use drop_database::*; +pub use drop_table::*; pub use fetch::*; +pub use get_database_info::*; +pub use get_latest_lake_snapshot::*; pub use get_table::*; pub use header::*; +pub use list_databases::*; +pub use list_tables::*; pub use produce_log::*; +pub use table_exists::*; pub use update_metadata::*; pub trait RequestBody { diff --git a/fluss-rust/crates/fluss/src/rpc/message/table_exists.rs b/fluss-rust/crates/fluss/src/rpc/message/table_exists.rs new file mode 100644 index 0000000000..3b71f471ac --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/table_exists.rs @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::metadata::TablePath; +use crate::{impl_read_version_type, impl_write_version_type, proto}; + +use crate::proto::TableExistsResponse; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::convert::to_table_path; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; + +use bytes::{Buf, BufMut}; +use prost::Message; + +#[derive(Debug)] +pub struct TableExistsRequest { + pub inner_request: proto::TableExistsRequest, +} + +impl TableExistsRequest { + pub fn new(table_path: &TablePath) -> Self { + TableExistsRequest { + inner_request: proto::TableExistsRequest { + table_path: to_table_path(table_path), + }, + } + } +} + +impl RequestBody for TableExistsRequest { + type ResponseBody = TableExistsResponse; + + const API_KEY: ApiKey = ApiKey::TableExists; + + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(TableExistsRequest); +impl_read_version_type!(TableExistsResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/server_connection.rs b/fluss-rust/crates/fluss/src/rpc/server_connection.rs index a102aa3ba6..4eeda46063 100644 --- a/fluss-rust/crates/fluss/src/rpc/server_connection.rs +++ b/fluss-rust/crates/fluss/src/rpc/server_connection.rs @@ -72,7 +72,6 @@ impl RpcClient { return Ok(connection.clone()); } } - let new_server = self.connect(server_node).await?; self.connections .write() From 4779328d24a274d98e29933785c78ffec819bb5b Mon Sep 17 00:00:00 2001 From: naivedogger <59598718+naivedogger@users.noreply.github.com> Date: Tue, 16 Sep 2025 11:54:04 +0800 Subject: [PATCH 007/287] [feat] Set up uv, ruff, and other settings for Python binding (#7) --- fluss-rust/bindings/python/Cargo.toml | 38 ++++++ fluss-rust/bindings/python/README.md | 149 ++++++++++++++++++++++ fluss-rust/bindings/python/pyproject.toml | 96 ++++++++++++++ 3 files changed, 283 insertions(+) create mode 100644 
fluss-rust/bindings/python/Cargo.toml create mode 100644 fluss-rust/bindings/python/README.md create mode 100644 fluss-rust/bindings/python/pyproject.toml diff --git a/fluss-rust/bindings/python/Cargo.toml b/fluss-rust/bindings/python/Cargo.toml new file mode 100644 index 0000000000..aee1a21314 --- /dev/null +++ b/fluss-rust/bindings/python/Cargo.toml @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "fluss_python" +edition = "2024" +version = "0.1.0" +license = "apache-2.0" +rust-version = "1.85" + +[lib] +name = "fluss" +crate-type = ["cdylib"] + +[workspace] + +[dependencies] +pyo3 = { version = "0.24", features = ["extension-module"] } +fluss = { path = "../../crates/fluss" } +tokio = { workspace = true } +arrow = { workspace = true } +arrow-pyarrow = "55.1.0" +pyo3-async-runtimes = { version = "0.24.0", features = ["tokio-runtime"] } +chrono = { workspace = true } diff --git a/fluss-rust/bindings/python/README.md b/fluss-rust/bindings/python/README.md new file mode 100644 index 0000000000..5258f53291 --- /dev/null +++ b/fluss-rust/bindings/python/README.md @@ -0,0 +1,149 @@ + + +# Apache Fluss™ Python Bindings + +Python bindings for Fluss using PyO3 and Maturin. 
+ +## API Overview + +### Basic Usage + +TODO: Add basic usage examples here + +### Core Classes + +#### `Config` + +Configuration for Fluss connection parameters + +#### `FlussConnection` + +Main interface for connecting to Fluss cluster + +#### `FlussAdmin` + +Administrative operations for managing tables (create, delete, etc.) + +#### `FlussTable` + +Represents a Fluss table, providing read and write operations + +#### `TableWriter` + +Used for writing data to tables, supports PyArrow and Pandas + +#### `LogScanner` + +Used for scanning table log data + +## Development + +### Requirements + +- Python 3.9+ +- Rust 1.85+ +- [uv](https://docs.astral.sh/uv/) package manager +- Linux or macOS + +> **⚠️ Before you start:** +> Please make sure you can successfully build and run the [Fluss Rust client](../../crates/fluss/README.md) on your machine. +> The Python bindings require a working Fluss Rust backend and compatible environment. + +### Install Development Dependencies + +```bash +cd bindings/python +uv sync --all-extras +``` + +### Build Development Version + +```bash +source .venv/bin/activate +uv run maturin develop +``` + +### Build Release Version + +```bash +uv run maturin build --release +``` + +### Code Formatting and Linting + +```bash +uv run ruff format python/ +uv run ruff check python/ +``` + +### Type Checking + +```bash +uv run mypy python/ +``` + +### Run Examples + +```bash +uv run python example/example.py +``` + +### Build API Docs + +```bash +uv run pdoc fluss_python +``` + +### Release + +```bash +# Build wheel +uv run maturin build --release + +# Publish to PyPI +uv run maturin publish +``` + +## Project Structure +``` +bindings/python/ +├── Cargo.toml # Rust dependency configuration +├── pyproject.toml # Python project configuration +├── README.md # This file +├── src/ # Rust source code +│ ├── lib.rs # Main entry module +│ ├── config.rs # Configuration related +│ ├── connection.rs # Connection management +│ ├── admin.rs # Admin operations +│
├── table.rs # Table operations +│ ├── types.rs # Data types +│ └── error.rs # Error handling +├── python/ # Python package source +│ └── fluss_python/ +│ ├── __init__.py # Python package entry +│ ├── __init__.pyi # Stub file +│ └── py.typed # Type declarations +└── example/ # Example code + └── example.py +``` + +## License + +Apache 2.0 License diff --git a/fluss-rust/bindings/python/pyproject.toml b/fluss-rust/bindings/python/pyproject.toml new file mode 100644 index 0000000000..fe9d58878d --- /dev/null +++ b/fluss-rust/bindings/python/pyproject.toml @@ -0,0 +1,96 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[build-system] +requires = ["maturin>=1.0,<2.0"] +build-backend = "maturin" + +[project] +name = "fluss" +description = "Python bindings for Fluss on fluss-rust with Pandas integration" +authors = [{name = "Fluss Team"}] +license = {text = "Apache-2.0"} +readme = "README.md" +requires-python = ">=3.9" +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +dynamic = ["version"] + +dependencies = [ + "pandas>=2.3.1", + "pyarrow>=10.0.0", +] + +[project.urls] +Repository = "https://github.com/apache/fluss-rust" + +[project.optional-dependencies] +dev = [ + "mypy>=1.17.1", + "pytest>=8.3.5", + "pytest-asyncio>=0.25.3", + "ruff>=0.9.10", + "maturin>=1.8.2", +] +docs = [ + "pdoc>=15.0.4", +] + +[tool.maturin] +python-source = "python" +module-name = "fluss._fluss" +features = ["pyo3/extension-module"] + +[tool.uv] +cache-keys = [ + { file = "pyproject.toml" }, + { file = "Cargo.toml" }, + { file = "src/**/*.rs" }, + { file = "../../crates/**/*.rs" }, +] + +[tool.ruff] +line-length = 88 +fix = true + +[tool.ruff.lint] +ignore = ["E402", "F403", "F405"] +select = ["E", "F", "I"] + +[tool.ruff.lint.pycodestyle] +max-doc-length = 88 + +[tool.ruff.lint.pydocstyle] +convention = "numpy" + +[tool.ruff.format] +docstring-code-format = true + +[tool.ruff.lint.isort] +known-first-party = ["fluss"] + +[tool.mypy] +python_version = "3.9" +warn_return_any = true +warn_unused_configs = true +ignore_missing_imports = true From bb8abafa13d6323d9c668dfedd9a6dcc1c3d1b21 Mon Sep 17 00:00:00 2001 From: naivedogger <59598718+naivedogger@users.noreply.github.com> Date: Wed, 17 Sep 2025 15:00:33 +0800 Subject: [PATCH 008/287] [feat] Create Python bindings for metadata (#8) --- fluss-rust/bindings/python/src/config.rs | 117 +++++ fluss-rust/bindings/python/src/metadata.rs | 581 
+++++++++++++++++++++ 2 files changed, 698 insertions(+) create mode 100644 fluss-rust/bindings/python/src/config.rs create mode 100644 fluss-rust/bindings/python/src/metadata.rs diff --git a/fluss-rust/bindings/python/src/config.rs b/fluss-rust/bindings/python/src/config.rs new file mode 100644 index 0000000000..08b20b4d40 --- /dev/null +++ b/fluss-rust/bindings/python/src/config.rs @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use pyo3::prelude::*; +use pyo3::types::PyDict; +use crate::*; + +/// Configuration for Fluss client +#[pyclass] +#[derive(Clone)] +pub struct Config { + inner: fcore::config::Config, +} + +#[pymethods] +impl Config { + /// Create a new Config with optional properties from a dictionary + #[new] + #[pyo3(signature = (properties = None))] + fn new(properties: Option<&Bound<'_, PyDict>>) -> PyResult { + let mut config = fcore::config::Config::default(); + + if let Some(props) = properties { + for item in props.iter() { + let key: String = item.0.extract()?; + let value: String = item.1.extract()?; + + match key.as_str() { + "bootstrap.servers" => { + config.bootstrap_server = Some(value); + }, + "request.max.size" => { + if let Ok(size) = value.parse::() { + config.request_max_size = size; + } + }, + "writer.acks" => { + config.writer_acks = value; + }, + "writer.retries" => { + if let Ok(retries) = value.parse::() { + config.writer_retries = retries; + } + }, + "writer.batch.size" => { + if let Ok(size) = value.parse::() { + config.writer_batch_size = size; + } + }, + _ => { + return Err(FlussError::new_err(format!("Unknown property: {}", key))); + } + } + } + } + + Ok(Self { + inner: config, + }) + } + + /// Get the bootstrap server + #[getter] + fn bootstrap_server(&self) -> Option { + self.inner.bootstrap_server.clone() + } + + /// Set the bootstrap server + #[setter] + fn set_bootstrap_server(&mut self, server: String) { + self.inner.bootstrap_server = Some(server); + } + + /// Get the request max size + #[getter] + fn request_max_size(&self) -> i32 { + self.inner.request_max_size + } + + /// Set the request max size + #[setter] + fn set_request_max_size(&mut self, size: i32) { + self.inner.request_max_size = size; + } + + /// Get the writer batch size + #[getter] + fn writer_batch_size(&self) -> i32 { + self.inner.writer_batch_size + } + + /// Set the writer batch size + #[setter] + fn set_writer_batch_size(&mut self, size: i32) { + 
self.inner.writer_batch_size = size; + } +} + +impl Config { + pub fn get_core_config(&self) -> fcore::config::Config { + self.inner.clone() + } +} diff --git a/fluss-rust/bindings/python/src/metadata.rs b/fluss-rust/bindings/python/src/metadata.rs new file mode 100644 index 0000000000..238dde2ec0 --- /dev/null +++ b/fluss-rust/bindings/python/src/metadata.rs @@ -0,0 +1,581 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use pyo3::prelude::*; +use crate::*; +use pyo3::types::PyDict; +use std::collections::HashMap; + +/// Represents a table path with database and table name +#[pyclass] +#[derive(Clone)] +pub struct TablePath { + database_name: String, + table_name: String, +} + +#[pymethods] +impl TablePath { + /// Create a new TablePath + #[new] + pub fn new(database_name: String, table_name: String) -> Self { + Self { + database_name, + table_name, + } + } + + /// Get the database name + #[getter] + pub fn database_name(&self) -> String { + self.database_name.clone() + } + + /// Get the table name + #[getter] + pub fn table_name(&self) -> String { + self.table_name.clone() + } + + /// Get table path as string + pub fn table_path_str(&self) -> String { + format!("{}.{}", self.database_name, self.table_name) + } + + pub fn __str__(&self) -> String { + self.table_path_str() + } + + fn __repr__(&self) -> String { + format!("TablePath('{}', '{}')", self.database_name, self.table_name) + } + + /// Hash implementation for Python + pub fn __hash__(&self) -> u64 { + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let mut hasher = DefaultHasher::new(); + self.database_name.hash(&mut hasher); + self.table_name.hash(&mut hasher); + hasher.finish() + } + + /// Equality implementation for Python + pub fn __eq__(&self, other: &TablePath) -> bool { + self.database_name == other.database_name + && self.table_name == other.table_name + } +} + +impl TablePath { + /// Convert to core TablePath + pub fn to_core(&self) -> fcore::metadata::TablePath { + fcore::metadata::TablePath::new(self.database_name.clone(), self.table_name.clone()) + } + + pub fn from_core(core_path: fcore::metadata::TablePath) -> Self { + Self { + database_name: core_path.database().to_string(), + table_name: core_path.table().to_string(), + } + } +} + +/// Schema wrapper for Fluss table schema +#[pyclass] +pub struct Schema { + __schema: fcore::metadata::Schema, +} + +#[pymethods] +impl 
Schema { + /// Create a new Schema from PyArrow schema with optional primary keys + #[new] + #[pyo3(signature = (schema, primary_keys=None))] + pub fn new( + schema: PyObject, // PyArrow schema + primary_keys: Option>, + ) -> PyResult { + let arrow_schema = crate::utils::Utils::pyarrow_to_arrow_schema(&schema)?; + + let mut builder = fcore::metadata::Schema::builder(); + + for field in arrow_schema.fields() { + let fluss_data_type = crate::utils::Utils::arrow_type_to_fluss_type(field.data_type())?; + builder = builder.column(field.name(), fluss_data_type); + + if let Some(comment) = field.metadata().get("comment") { + builder = builder.with_comment(comment); + } + } + + if let Some(pk_columns) = primary_keys { + if !pk_columns.is_empty() { + builder = builder.primary_key(pk_columns); + } + } + + let fluss_schema = builder.build() + .map_err(|e| FlussError::new_err(format!("Failed to build schema: {}", e)))?; + + Ok(Self { + __schema: fluss_schema, + }) + } + + /// Get column names + fn get_column_names(&self) -> Vec { + self.__schema.columns().iter().map(|col| col.name().to_string()).collect() + } + + /// Get column types + fn get_column_types(&self) -> Vec { + self.__schema.columns().iter() + .map(|col| Utils::datatype_to_string(col.data_type())) + .collect() + } + + /// Get columns as (name, type) pairs + fn get_columns(&self) -> Vec<(String, String)> { + self.__schema.columns().iter() + .map(|col| (col.name().to_string(), Utils::datatype_to_string(col.data_type()))) + .collect() + } + + // TODO: support primaryKey + + fn __str__(&self) -> String { + format!("Schema: columns={:?}", self.get_columns()) + } +} + +impl Schema { + /// Convert to core Schema + pub fn to_core(&self) -> &fcore::metadata::Schema { + &self.__schema + } +} + +/// Table distribution configuration +#[pyclass] +pub struct TableDistribution { + inner: fcore::metadata::TableDistribution, +} + +#[pymethods] +impl TableDistribution { + /// Get bucket keys + fn bucket_keys(&self) -> Vec { + 
self.inner.bucket_keys().to_vec() + } + + /// Get bucket count + fn bucket_count(&self) -> Option { + self.inner.bucket_count() + } +} + + +/// Table descriptor containing schema and metadata +#[pyclass] +#[derive(Clone)] +pub struct TableDescriptor { + __tbl_desc: fcore::metadata::TableDescriptor, +} + +#[pymethods] +impl TableDescriptor { + /// Create a new TableDescriptor + #[new] + #[pyo3(signature = (schema, **kwargs))] + pub fn new( + schema: &Schema, // fluss schema + kwargs: Option<&Bound<'_, PyDict>>, + ) -> PyResult { + let mut partition_keys = Vec::new(); + let mut bucket_count = None; + let mut bucket_keys = Vec::new(); + let mut properties = std::collections::HashMap::new(); + let mut custom_properties = std::collections::HashMap::new(); + let mut comment: Option = None; + let mut log_format = None; + let mut kv_format = None; + + if let Some(kwargs) = kwargs { + if let Ok(Some(pkeys)) = kwargs.get_item("partition_keys") { + partition_keys = pkeys.extract()?; + } + if let Ok(Some(bcount)) = kwargs.get_item("bucket_count") { + bucket_count = Some(bcount.extract()?); + } + if let Ok(Some(bkeys)) = kwargs.get_item("bucket_keys") { + bucket_keys = bkeys.extract()?; + } + if let Ok(Some(props)) = kwargs.get_item("properties") { + properties = props.extract()?; + } + if let Ok(Some(cprops)) = kwargs.get_item("custom_properties") { + custom_properties = cprops.extract()?; + } + if let Ok(Some(comm)) = kwargs.get_item("comment") { + comment = Some(comm.extract()?); + } + if let Ok(Some(lformat)) = kwargs.get_item("log_format") { + let format_str: String = lformat.extract()?; + log_format = Some(fcore::metadata::LogFormat::parse(&format_str) + .map_err(|e| FlussError::new_err(e.to_string()))?); + } + if let Ok(Some(kformat)) = kwargs.get_item("kv_format") { + let format_str: String = kformat.extract()?; + kv_format = Some(fcore::metadata::KvFormat::parse(&format_str) + .map_err(|e| FlussError::new_err(e.to_string()))?); + } + } + + let fluss_schema = 
schema.to_core().clone(); + + let mut builder = fcore::metadata::TableDescriptor::builder() + .schema(fluss_schema) + .properties(properties) + .custom_properties(custom_properties) + .partitioned_by(partition_keys) + .distributed_by(bucket_count, bucket_keys); + + if let Some(comment) = comment { + builder = builder.comment(&comment); + } + if let Some(log_format) = log_format { + builder = builder.log_format(log_format); + } + if let Some(kv_format) = kv_format { + builder = builder.kv_format(kv_format); + } + + let core_descriptor = builder.build() + .map_err(|e| FlussError::new_err(format!("Failed to build TableDescriptor: {}", e)))?; + + Ok(Self { + __tbl_desc: core_descriptor, + }) + } + + /// Get the schema of this table descriptor + pub fn get_schema(&self) -> PyResult { + Ok(Schema { + __schema: self.__tbl_desc.schema().clone(), + }) + } +} + +impl TableDescriptor { + /// Convert to core TableDescriptor + pub fn to_core(&self) -> &fcore::metadata::TableDescriptor { + &self.__tbl_desc + } +} + +/// Information about a Fluss table +#[pyclass] +#[derive(Clone)] +pub struct TableInfo { + __table_info: fcore::metadata::TableInfo, +} + +#[pymethods] +impl TableInfo { + /// Get the table ID + #[getter] + pub fn table_id(&self) -> i64 { + self.__table_info.get_table_id() + } + + /// Get the schema ID + #[getter] + pub fn schema_id(&self) -> i32 { + self.__table_info.get_schema_id() + } + + /// Get the table path + #[getter] + pub fn table_path(&self) -> TablePath { + TablePath::from_core(self.__table_info.get_table_path().clone()) + } + + /// Get the created time + #[getter] + pub fn created_time(&self) -> i64 { + self.__table_info.get_created_time() + } + + /// Get the modified time + #[getter] + pub fn modified_time(&self) -> i64 { + self.__table_info.get_modified_time() + } + + /// Get the primary keys + pub fn get_primary_keys(&self) -> Vec { + self.__table_info.get_primary_keys().clone() + } + + /// Get the bucket keys + pub fn get_bucket_keys(&self) -> Vec { 
+ self.__table_info.get_bucket_keys().to_vec() + } + + /// Get the partition keys + pub fn get_partition_keys(&self) -> Vec { + self.__table_info.get_partition_keys().to_vec() + } + + /// Get number of buckets + #[getter] + pub fn num_buckets(&self) -> i32 { + self.__table_info.get_num_buckets() + } + + /// Check if table has primary key + pub fn has_primary_key(&self) -> bool { + self.__table_info.has_primary_key() + } + + /// Check if table is partitioned + pub fn is_partitioned(&self) -> bool { + self.__table_info.is_partitioned() + } + + /// Get properties + pub fn get_properties(&self) -> std::collections::HashMap { + self.__table_info.get_properties().clone() + } + + /// Get custom properties + pub fn get_custom_properties(&self) -> std::collections::HashMap { + self.__table_info.get_custom_properties().clone() + } + + /// Get comment + #[getter] + pub fn comment(&self) -> Option { + self.__table_info.get_comment().map(|s| s.to_string()) + } + + /// Get the Schema + pub fn get_schema(&self) -> Schema { + Schema { + __schema: self.__table_info.get_schema().clone(), + } + } + + /// Get column names + pub fn get_column_names(&self) -> Vec { + self.__table_info.get_schema().columns().iter() + .map(|col| col.name().to_string()) + .collect() + } + + /// Get column count + pub fn get_column_count(&self) -> usize { + self.__table_info.get_schema().columns().len() + } +} + +impl TableInfo { + /// Create from core TableInfo (internal use) + pub fn from_core(info: fcore::metadata::TableInfo) -> Self { + Self { + __table_info: info, + } + } +} + +/// Represents a lake snapshot with snapshot ID and table bucket offsets +#[pyclass] +#[derive(Clone)] +pub struct LakeSnapshot { + snapshot_id: i64, + table_buckets_offset: HashMap, +} + +/// Represents a table bucket with table ID, partition ID, and bucket ID +#[pyclass] +#[derive(Clone)] +pub struct TableBucket { + table_id: i64, + partition_id: Option, + bucket: i32, +} + +#[pymethods] +impl TableBucket { + /// Create a new 
TableBucket + #[new] + pub fn new(table_id: i64, bucket: i32) -> Self { + Self { + table_id, + partition_id: None, + bucket, + } + } + + /// Create a new TableBucket with partition + #[staticmethod] + pub fn with_partition(table_id: i64, partition_id: i64, bucket: i32) -> Self { + Self { + table_id, + partition_id: Some(partition_id), + bucket, + } + } + + /// Get table ID + #[getter] + pub fn table_id(&self) -> i64 { + self.table_id + } + + /// Get bucket ID + #[getter] + pub fn bucket_id(&self) -> i32 { + self.bucket + } + + /// Get partition ID + #[getter] + pub fn partition_id(&self) -> Option { + self.partition_id + } + + /// String representation + pub fn __str__(&self) -> String { + if let Some(partition_id) = self.partition_id { + format!("TableBucket(table_id={}, partition_id={}, bucket={})", + self.table_id, partition_id, self.bucket) + } else { + format!("TableBucket(table_id={}, bucket={})", + self.table_id, self.bucket) + } + } + + /// String representation + pub fn __repr__(&self) -> String { + self.__str__() + } + + /// Hash implementation for Python + pub fn __hash__(&self) -> u64 { + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let mut hasher = DefaultHasher::new(); + self.table_id.hash(&mut hasher); + self.partition_id.hash(&mut hasher); + self.bucket.hash(&mut hasher); + hasher.finish() + } + + /// Equality implementation for Python + pub fn __eq__(&self, other: &TableBucket) -> bool { + self.table_id == other.table_id + && self.partition_id == other.partition_id + && self.bucket == other.bucket + } +} + +impl TableBucket { + /// Create from core TableBucket (internal use) + pub fn from_core(bucket: fcore::metadata::TableBucket) -> Self { + Self { + table_id: bucket.table_id(), + partition_id: bucket.partition_id(), + bucket: bucket.bucket_id(), + } + } + + /// Convert to core TableBucket (internal use) + pub fn to_core(&self) -> fcore::metadata::TableBucket { + 
fcore::metadata::TableBucket::new(self.table_id, self.partition_id, self.bucket) + } +} + +#[pymethods] +impl LakeSnapshot { + /// Create a new LakeSnapshot + #[new] + pub fn new(snapshot_id: i64) -> Self { + Self { + snapshot_id, + table_buckets_offset: HashMap::new(), + } + } + + /// Get snapshot ID + #[getter] + pub fn snapshot_id(&self) -> i64 { + self.snapshot_id + } + + /// Get table bucket offsets as a Python dictionary with TableBucket keys + #[getter] + pub fn table_buckets_offset(&self, py: Python) -> PyResult { + let dict = PyDict::new(py); + for (bucket, offset) in &self.table_buckets_offset { + let py_bucket = TableBucket::from_core(bucket.clone()); + dict.set_item(Py::new(py, py_bucket)?, *offset)?; + } + Ok(dict.into()) + } + + /// Get offset for a specific table bucket + pub fn get_bucket_offset(&self, bucket: &TableBucket) -> Option { + let core_bucket = bucket.to_core(); + self.table_buckets_offset.get(&core_bucket).copied() + } + + /// Get all table buckets + pub fn get_table_buckets(&self, py: Python) -> PyResult> { + let mut buckets = Vec::new(); + for bucket in self.table_buckets_offset.keys() { + let py_bucket = TableBucket::from_core(bucket.clone()); + buckets.push(Py::new(py, py_bucket)?.into()); + } + Ok(buckets) + } + + /// String representation + pub fn __str__(&self) -> String { + format!("LakeSnapshot(snapshot_id={}, buckets_count={})", + self.snapshot_id, self.table_buckets_offset.len()) + } + + /// String representation + pub fn __repr__(&self) -> String { + self.__str__() + } +} + +impl LakeSnapshot { + /// Create from core LakeSnapshot (internal use) + pub fn from_core(snapshot: fcore::metadata::LakeSnapshot) -> Self { + Self { + snapshot_id: snapshot.snapshot_id, + table_buckets_offset: snapshot.table_buckets_offset, + } + } +} + From 757a833a868cb3bde4f1504ce73c11c58507985f Mon Sep 17 00:00:00 2001 From: naivedogger <59598718+naivedogger@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:09:53 +0800 Subject: [PATCH 009/287] 
[feat] Create Python bindings for Fluss Admin (#6) --- fluss-rust/bindings/python/fluss/__init__.py | 20 +++ fluss-rust/bindings/python/src/admin.rs | 107 +++++++++++ fluss-rust/bindings/python/src/connection.rs | 117 ++++++++++++ fluss-rust/bindings/python/src/error.rs | 39 ++++ fluss-rust/bindings/python/src/lib.rs | 67 +++++++ fluss-rust/bindings/python/src/utils.rs | 178 +++++++++++++++++++ 6 files changed, 528 insertions(+) create mode 100644 fluss-rust/bindings/python/fluss/__init__.py create mode 100644 fluss-rust/bindings/python/src/admin.rs create mode 100644 fluss-rust/bindings/python/src/connection.rs create mode 100644 fluss-rust/bindings/python/src/error.rs create mode 100644 fluss-rust/bindings/python/src/lib.rs create mode 100644 fluss-rust/bindings/python/src/utils.rs diff --git a/fluss-rust/bindings/python/fluss/__init__.py b/fluss-rust/bindings/python/fluss/__init__.py new file mode 100644 index 0000000000..cceee102ad --- /dev/null +++ b/fluss-rust/bindings/python/fluss/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from .fluss_python import * + +__version__ = "0.1.0" diff --git a/fluss-rust/bindings/python/src/admin.rs b/fluss-rust/bindings/python/src/admin.rs new file mode 100644 index 0000000000..7ec6eee93b --- /dev/null +++ b/fluss-rust/bindings/python/src/admin.rs @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use pyo3::prelude::*; +use pyo3_async_runtimes::tokio::future_into_py; +use crate::*; +use std::sync::Arc; + +/// Administrative client for managing Fluss tables +#[pyclass] +pub struct FlussAdmin { + __admin: Arc, +} + +#[pymethods] +impl FlussAdmin { + /// Create a table with the given schema + #[pyo3(signature = (table_path, table_descriptor, ignore_if_exists=None))] + pub fn create_table<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + table_descriptor: &TableDescriptor, + ignore_if_exists: Option, + ) -> PyResult> { + let ignore = ignore_if_exists.unwrap_or(false); + + let core_table_path = table_path.to_core().clone(); + let core_descriptor = table_descriptor.to_core().clone(); + let admin = self.__admin.clone(); + + future_into_py(py, async move { + admin.create_table(&core_table_path, &core_descriptor, ignore) + .await + .map_err(|e| FlussError::new_err(e.to_string()))?; + + Python::with_gil(|py| Ok(py.None())) + }) + } + + /// Get table information + pub fn get_table<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + ) -> PyResult> { + let core_table_path = table_path.to_core().clone(); + let admin = self.__admin.clone(); + + future_into_py(py, async move { + let core_table_info = admin.get_table(&core_table_path).await + .map_err(|e| FlussError::new_err(format!("Failed to get table: {}", e)))?; + + Python::with_gil(|py| { + let table_info = TableInfo::from_core(core_table_info); + Py::new(py, table_info) + }) + }) + } + + /// Get the latest lake snapshot for a table + pub fn get_latest_lake_snapshot<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + ) -> PyResult> { + let core_table_path = table_path.to_core().clone(); + let admin = self.__admin.clone(); + + future_into_py(py, async move { + let core_lake_snapshot = admin.get_latest_lake_snapshot(&core_table_path).await + .map_err(|e| FlussError::new_err(format!("Failed to get lake snapshot: {}", e)))?; + + Python::with_gil(|py| { + let lake_snapshot = 
LakeSnapshot::from_core(core_lake_snapshot); + Py::new(py, lake_snapshot) + }) + }) + } + + fn __repr__(&self) -> String { + "FlussAdmin()".to_string() + } +} + +impl FlussAdmin { + // Internal method to create FlussAdmin from core admin + pub fn from_core(admin: fcore::client::FlussAdmin) -> Self { + Self { + __admin: Arc::new(admin), + } + } +} diff --git a/fluss-rust/bindings/python/src/connection.rs b/fluss-rust/bindings/python/src/connection.rs new file mode 100644 index 0000000000..ba1fa50554 --- /dev/null +++ b/fluss-rust/bindings/python/src/connection.rs @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use pyo3::prelude::*; +use crate::*; +use std::sync::Arc; +use pyo3_async_runtimes::tokio::future_into_py; + +/// Connection to a Fluss cluster +#[pyclass] +pub struct FlussConnection { + inner: Arc, +} + +#[pymethods] +impl FlussConnection { + /// Create a new FlussConnection (async) + #[staticmethod] + fn connect<'py>(py: Python<'py>, config: &Config) -> PyResult> { + let rust_config = config.get_core_config(); + + future_into_py(py, async move { + let connection = fcore::client::FlussConnection::new(rust_config) + .await + .map_err(|e| FlussError::new_err(e.to_string()))?; + + let py_connection = FlussConnection { + inner: Arc::new(connection), + }; + + Python::with_gil(|py| { + Py::new(py, py_connection) + }) + }) + } + + /// Get admin interface + fn get_admin<'py>(&self, py: Python<'py>) -> PyResult> { + let client = self.inner.clone(); + + future_into_py(py, async move { + let admin = client.get_admin() + .await + .map_err(|e| FlussError::new_err(e.to_string()))?; + + let py_admin = FlussAdmin::from_core(admin); + + Python::with_gil(|py| { + Py::new(py, py_admin) + }) + }) + } + + /// Get a table + fn get_table<'py>(&self, py: Python<'py>, table_path: &TablePath) -> PyResult> { + let client = self.inner.clone(); + let core_path = table_path.to_core().clone(); + + future_into_py(py, async move { + let core_table = client.get_table(&core_path) + .await + .map_err(|e| FlussError::new_err(e.to_string()))?; + + let py_table = FlussTable::new_table( + client, + core_table.metadata, + core_table.table_info, + core_table.table_path, + core_table.has_primary_key, + ); + + Python::with_gil(|py| { + Py::new(py, py_table) + }) + }) + } + + // Close the connection + fn close(&mut self) -> PyResult<()> { + Ok(()) + } + + // Enter the runtime context (for 'with' statement) + fn __enter__(slf: PyRef) -> PyRef { + slf + } + + // Exit the runtime context (for 'with' statement) + #[pyo3(signature = (_exc_type=None, _exc_value=None, _traceback=None))] + fn __exit__( + &mut 
self, + _exc_type: Option, + _exc_value: Option, + _traceback: Option, + ) -> PyResult { + self.close()?; + Ok(false) + } + + fn __repr__(&self) -> String { + "FlussConnection()".to_string() + } +} diff --git a/fluss-rust/bindings/python/src/error.rs b/fluss-rust/bindings/python/src/error.rs new file mode 100644 index 0000000000..2db2991cfb --- /dev/null +++ b/fluss-rust/bindings/python/src/error.rs @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use pyo3::prelude::*; + +/// Fluss errors +#[pyclass(extends=PyException)] +#[derive(Debug, Clone)] +pub struct FlussError { + #[pyo3(get)] + pub message: String, +} + +#[pymethods] +impl FlussError { + fn __str__(&self) -> String { + format!("FlussError: {}", self.message) + } +} + +impl FlussError { + pub fn new_err(message: impl ToString) -> PyErr { + PyErr::new::(message.to_string()) + } +} \ No newline at end of file diff --git a/fluss-rust/bindings/python/src/lib.rs b/fluss-rust/bindings/python/src/lib.rs new file mode 100644 index 0000000000..0d8b7a5a80 --- /dev/null +++ b/fluss-rust/bindings/python/src/lib.rs @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub use ::fluss as fcore; +use pyo3::prelude::*; +use once_cell::sync::Lazy; +use tokio::runtime::Runtime; + +mod config; +mod connection; +mod table; +mod admin; +mod types; +mod error; +mod utils; + +pub use config::*; +pub use connection::*; +pub use table::*; +pub use admin::*; +pub use types::*; +pub use error::*; +pub use utils::*; + +static TOKIO_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("Failed to create Tokio runtime") +}); + +#[pymodule] +fn fluss_python(m: &Bound<'_, PyModule>) -> PyResult<()> { + // Register all classes + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + + // Register exception types + // TODO: maybe implement a separate module for exceptions + m.add("FlussError", m.py().get_type::())?; + + Ok(()) +} diff --git a/fluss-rust/bindings/python/src/utils.rs b/fluss-rust/bindings/python/src/utils.rs new file mode 100644 index 0000000000..c40104bfc9 --- /dev/null +++ b/fluss-rust/bindings/python/src/utils.rs @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// 
or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use pyo3::prelude::*; +use arrow::datatypes::{Schema as ArrowSchema, SchemaRef}; +use std::sync::Arc; +use arrow_pyarrow::ToPyArrow; +use crate::*; + +/// Utilities for schema conversion between PyArrow, Arrow, and Fluss +pub struct Utils; + +impl Utils { + /// Convert PyArrow schema to Rust Arrow schema + pub fn pyarrow_to_arrow_schema(py_schema: &PyObject) -> PyResult { + Python::with_gil(|py| { + let schema_bound = py_schema.bind(py); + + let schema: ArrowSchema = arrow_pyarrow::FromPyArrow::from_pyarrow_bound(&schema_bound) + .map_err(|e| FlussError::new_err(format!("Failed to convert PyArrow schema: {}", e)))?; + Ok(Arc::new(schema)) + }) + } + + /// Convert Arrow DataType to Fluss DataType + pub fn arrow_type_to_fluss_type(arrow_type: &arrow::datatypes::DataType) -> PyResult { + use arrow::datatypes::DataType as ArrowDataType; + use fcore::metadata::DataTypes; + + let fluss_type = match arrow_type { + ArrowDataType::Boolean => DataTypes::boolean(), + ArrowDataType::Int8 => DataTypes::tinyint(), + ArrowDataType::Int16 => DataTypes::smallint(), + ArrowDataType::Int32 => DataTypes::int(), + ArrowDataType::Int64 => DataTypes::bigint(), + ArrowDataType::UInt8 => DataTypes::tinyint(), + ArrowDataType::UInt16 => 
DataTypes::smallint(), + ArrowDataType::UInt32 => DataTypes::int(), + ArrowDataType::UInt64 => DataTypes::bigint(), + ArrowDataType::Float32 => DataTypes::float(), + ArrowDataType::Float64 => DataTypes::double(), + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => DataTypes::string(), + ArrowDataType::Binary | ArrowDataType::LargeBinary => DataTypes::bytes(), + ArrowDataType::Date32 => DataTypes::date(), + ArrowDataType::Date64 => DataTypes::date(), + ArrowDataType::Time32(_) | ArrowDataType::Time64(_) => DataTypes::time(), + ArrowDataType::Timestamp(_, _) => DataTypes::timestamp(), + ArrowDataType::Decimal128(precision, scale) => DataTypes::decimal(*precision as u32, *scale as u32), + _ => { + return Err(FlussError::new_err(format!( + "Unsupported Arrow data type: {:?}", arrow_type + ))); + } + }; + + Ok(fluss_type) + } + + /// Convert Fluss DataType to string representation + pub fn datatype_to_string(data_type: &fcore::metadata::DataType) -> String { + match data_type { + fcore::metadata::DataType::Boolean(_) => "boolean".to_string(), + fcore::metadata::DataType::TinyInt(_) => "tinyint".to_string(), + fcore::metadata::DataType::SmallInt(_) => "smallint".to_string(), + fcore::metadata::DataType::Int(_) => "int".to_string(), + fcore::metadata::DataType::BigInt(_) => "bigint".to_string(), + fcore::metadata::DataType::Float(_) => "float".to_string(), + fcore::metadata::DataType::Double(_) => "double".to_string(), + fcore::metadata::DataType::String(_) => "string".to_string(), + fcore::metadata::DataType::Bytes(_) => "bytes".to_string(), + fcore::metadata::DataType::Date(_) => "date".to_string(), + fcore::metadata::DataType::Time(t) => { + if t.precision() == 0 { + "time".to_string() + } else { + format!("time({})", t.precision()) + } + }, + fcore::metadata::DataType::Timestamp(t) => { + if t.precision() == 6 { + "timestamp".to_string() + } else { + format!("timestamp({})", t.precision()) + } + }, + fcore::metadata::DataType::TimestampLTz(t) => { + if t.precision() 
== 6 { + "timestamp_ltz".to_string() + } else { + format!("timestamp_ltz({})", t.precision()) + } + }, + fcore::metadata::DataType::Char(c) => format!("char({})", c.length()), + fcore::metadata::DataType::Decimal(d) => format!("decimal({},{})", d.precision(), d.scale()), + fcore::metadata::DataType::Binary(b) => format!("binary({})", b.length()), + fcore::metadata::DataType::Array(arr) => format!("array<{}>", Utils::datatype_to_string(arr.get_element_type())), + fcore::metadata::DataType::Map(map) => format!("map<{},{}>", + Utils::datatype_to_string(map.key_type()), + Utils::datatype_to_string(map.value_type())), + fcore::metadata::DataType::Row(row) => { + let fields: Vec = row.fields().iter() + .map(|field| format!("{}: {}", field.name(), Utils::datatype_to_string(field.data_type()))) + .collect(); + format!("row<{}>", fields.join(", ")) + }, + } + } + + /// Parse log format string to LogFormat enum + pub fn parse_log_format(format_str: &str) -> PyResult { + fcore::metadata::LogFormat::parse(format_str) + .map_err(|e| FlussError::new_err(format!("Invalid log format '{}': {}", format_str, e))) + } + + /// Parse kv format string to KvFormat enum + pub fn parse_kv_format(format_str: &str) -> PyResult { + fcore::metadata::KvFormat::parse(format_str) + .map_err(|e| FlussError::new_err(format!("Invalid kv format '{}': {}", format_str, e))) + } + + /// Convert ScanRecords to Arrow RecordBatch + pub fn convert_scan_records_to_arrow( + _scan_records: fcore::record::ScanRecords, + ) -> Vec> { + let mut result = Vec::new(); + for(_, records) in _scan_records.into_records() { + for record in records { + let columnar_row = record.row(); + let row_id = columnar_row.get_row_id(); + if row_id == 0 { + let record_batch = columnar_row.get_record_batch(); + result.push(record_batch.clone()); + } + } + } + result + } + + /// Combine multiple Arrow batches into a single Table + pub fn combine_batches_to_table(py: Python, batches: Vec>) -> PyResult { + if batches.is_empty() { + return 
Err(FlussError::new_err("No batches to combine")); + } + + // Convert Rust Arrow RecordBatch to PyObject + let py_batches: Result, _> = batches.iter() + .map(|batch| { + batch.as_ref().to_pyarrow(py) + .map_err(|e| FlussError::new_err(format!("Failed to convert RecordBatch to PyObject: {}", e))) + }) + .collect(); + + let py_batches = py_batches?; + + let pyarrow = py.import("pyarrow")?; + + // Use pyarrow.Table.from_batches to combine batches + let table = pyarrow + .getattr("Table")? + .call_method1("from_batches", (py_batches,))?; + + Ok(table.into()) + } +} From 1d937bbe868719faf49efcf8486a4974223d85ec Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Wed, 15 Oct 2025 18:02:18 +0800 Subject: [PATCH 010/287] [chore] Introduce IT infra and add IT for database operations in admin (#28) --- fluss-rust/.github/workflows/ci.yml | 6 +- fluss-rust/crates/fluss/Cargo.toml | 4 + fluss-rust/crates/fluss/src/client/admin.rs | 5 +- .../crates/fluss/src/metadata/database.rs | 19 +- fluss-rust/crates/fluss/src/metadata/table.rs | 4 +- .../crates/fluss/tests/integration/admin.rs | 131 ++++++++++++ .../fluss/tests/integration/client/mod.rs | 21 -- .../fluss/tests/integration/fluss_cluster.rs | 192 ++++++++++++++++++ fluss-rust/crates/fluss/tests/test_fluss.rs | 4 +- 9 files changed, 347 insertions(+), 39 deletions(-) create mode 100644 fluss-rust/crates/fluss/tests/integration/admin.rs delete mode 100644 fluss-rust/crates/fluss/tests/integration/client/mod.rs create mode 100644 fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs diff --git a/fluss-rust/.github/workflows/ci.yml b/fluss-rust/.github/workflows/ci.yml index 26616292b5..73e2b3f172 100644 --- a/fluss-rust/.github/workflows/ci.yml +++ b/fluss-rust/.github/workflows/ci.yml @@ -88,7 +88,11 @@ jobs: RUST_LOG: DEBUG RUST_BACKTRACE: full - name: Integration Test - run: cargo test --features integration_tests --all-targets --workspace + # only run IT in linux since no docker in macos by default + run: | + if [ 
"$RUNNER_OS" == "Linux" ]; then + cargo test --features integration_tests --all-targets --workspace + fi env: RUST_LOG: DEBUG RUST_BACKTRACE: full \ No newline at end of file diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index cc26014a46..a728bd74f8 100644 --- a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -46,6 +46,10 @@ parse-display = "0.10" ref-cast = "1.0" chrono = { version = "0.4", features = ["clock", "std", "wasmbind"] } +[dev-dependencies] +testcontainers = "0.25.0" +once_cell = "1.19" +test-env-helpers = "0.2.2" [features] integration_tests = [] diff --git a/fluss-rust/crates/fluss/src/client/admin.rs b/fluss-rust/crates/fluss/src/client/admin.rs index 2584034a69..fd0f316374 100644 --- a/fluss-rust/crates/fluss/src/client/admin.rs +++ b/fluss-rust/crates/fluss/src/client/admin.rs @@ -150,7 +150,7 @@ impl FlussAdmin { database_name: &str, ignore_if_not_exists: bool, cascade: bool, - ) -> Result<()> { + ) { let _response = self .admin_gateway .request(DropDatabaseRequest::new( @@ -158,8 +158,7 @@ impl FlussAdmin { ignore_if_not_exists, cascade, )) - .await?; - Ok(()) + .await; } /// List all databases diff --git a/fluss-rust/crates/fluss/src/metadata/database.rs b/fluss-rust/crates/fluss/src/metadata/database.rs index 2649421d6f..8eaa4d3eb0 100644 --- a/fluss-rust/crates/fluss/src/metadata/database.rs +++ b/fluss-rust/crates/fluss/src/metadata/database.rs @@ -22,7 +22,7 @@ use serde::{Deserialize, Serialize}; use serde_json::{Value, json}; use std::collections::HashMap; -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct DatabaseDescriptor { comment: Option, custom_properties: HashMap, @@ -105,11 +105,11 @@ impl DatabaseDescriptorBuilder { self } - pub fn build(self) -> Result { - Ok(DatabaseDescriptor { + pub fn build(self) -> DatabaseDescriptor { + DatabaseDescriptor { comment: self.comment, 
custom_properties: self.custom_properties, - }) + } } } @@ -179,7 +179,7 @@ impl JsonSerde for DatabaseDescriptor { }; builder = builder.custom_properties(custom_properties); - builder.build() + Ok(builder.build()) } } @@ -187,7 +187,7 @@ impl DatabaseDescriptor { /// Create DatabaseDescriptor from JSON bytes (equivalent to Java's fromJsonBytes) pub fn from_json_bytes(bytes: &[u8]) -> Result { let json_value: Value = serde_json::from_slice(bytes) - .map_err(|e| JsonSerdeError(format!("Failed to parse JSON: {}", e)))?; + .map_err(|e| JsonSerdeError(format!("Failed to parse JSON: {e}")))?; Self::deserialize_json(&json_value) } @@ -195,7 +195,7 @@ impl DatabaseDescriptor { pub fn to_json_bytes(&self) -> Result> { let json_value = self.serialize_json()?; serde_json::to_vec(&json_value) - .map_err(|e| JsonSerdeError(format!("Failed to serialize to JSON: {}", e))) + .map_err(|e| JsonSerdeError(format!("Failed to serialize to JSON: {e}"))) } } @@ -212,8 +212,7 @@ mod tests { let descriptor = DatabaseDescriptor::builder() .comment("Test database") .custom_properties(custom_props) - .build() - .unwrap(); + .build(); // Test serialization let json_bytes = descriptor.to_json_bytes().unwrap(); @@ -226,7 +225,7 @@ mod tests { #[test] fn test_empty_database_descriptor() { - let descriptor = DatabaseDescriptor::builder().build().unwrap(); + let descriptor = DatabaseDescriptor::builder().build(); let json_bytes = descriptor.to_json_bytes().unwrap(); let deserialized = DatabaseDescriptor::from_json_bytes(&json_bytes).unwrap(); assert_eq!(descriptor, deserialized); diff --git a/fluss-rust/crates/fluss/src/metadata/table.rs b/fluss-rust/crates/fluss/src/metadata/table.rs index 90e3573eb8..2b48ec60db 100644 --- a/fluss-rust/crates/fluss/src/metadata/table.rs +++ b/fluss-rust/crates/fluss/src/metadata/table.rs @@ -589,7 +589,7 @@ impl LogFormat { match s.to_uppercase().as_str() { "ARROW" => Ok(LogFormat::ARROW), "INDEXED" => Ok(LogFormat::INDEXED), - _ => 
Err(InvalidTableError(format!("Unknown log format: {}", s))), + _ => Err(InvalidTableError(format!("Unknown log format: {s}"))), } } } @@ -615,7 +615,7 @@ impl KvFormat { match s.to_uppercase().as_str() { "INDEXED" => Ok(KvFormat::INDEXED), "COMPACTED" => Ok(KvFormat::COMPACTED), - _ => Err(InvalidTableError(format!("Unknown kv format: {}", s))), + _ => Err(InvalidTableError(format!("Unknown kv format: {s}"))), } } } diff --git a/fluss-rust/crates/fluss/tests/integration/admin.rs b/fluss-rust/crates/fluss/tests/integration/admin.rs new file mode 100644 index 0000000000..73f52db936 --- /dev/null +++ b/fluss-rust/crates/fluss/tests/integration/admin.rs @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::integration::fluss_cluster::FlussTestingCluster; +use once_cell::sync::Lazy; +use parking_lot::RwLock; +use std::sync::Arc; + +#[cfg(test)] +use test_env_helpers::*; + +// Module-level shared cluster instance (only for this test file) +static SHARED_FLUSS_CLUSTER: Lazy>>> = + Lazy::new(|| Arc::new(RwLock::new(None))); + +#[cfg(test)] +#[before_all] +#[after_all] +mod admin_test { + use super::SHARED_FLUSS_CLUSTER; + use crate::integration::fluss_cluster::{FlussTestingCluster, FlussTestingClusterBuilder}; + use fluss::metadata::DatabaseDescriptorBuilder; + use std::sync::Arc; + + fn before_all() { + // Create a new tokio runtime in a separate thread + let cluster_guard = SHARED_FLUSS_CLUSTER.clone(); + std::thread::spawn(move || { + let rt = tokio::runtime::Runtime::new().expect("Failed to create runtime"); + rt.block_on(async { + let cluster = FlussTestingClusterBuilder::new().build().await; + let mut guard = cluster_guard.write(); + *guard = Some(cluster); + }); + }) + .join() + .expect("Failed to create cluster"); + } + + fn get_fluss_cluster() -> Arc { + let cluster_guard = SHARED_FLUSS_CLUSTER.read(); + if cluster_guard.is_none() { + panic!("Fluss cluster not initialized. 
Make sure before_all() was called."); + } + Arc::new(cluster_guard.as_ref().unwrap().clone()) + } + + fn after_all() { + // Create a new tokio runtime in a separate thread + let cluster_guard = SHARED_FLUSS_CLUSTER.clone(); + std::thread::spawn(move || { + let rt = tokio::runtime::Runtime::new().expect("Failed to create runtime"); + rt.block_on(async { + let mut guard = cluster_guard.write(); + if let Some(cluster) = guard.take() { + cluster.stop().await; + } + }); + }) + .join() + .expect("Failed to cleanup cluster"); + } + + #[tokio::test] + async fn test_create_database() { + let cluster = get_fluss_cluster(); + let connection = cluster.get_fluss_connection().await; + + let admin = connection.get_admin().await.expect("should get admin"); + + let db_descriptor = DatabaseDescriptorBuilder::default() + .comment("test_db") + .custom_properties( + [ + ("k1".to_string(), "v1".to_string()), + ("k2".to_string(), "v2".to_string()), + ] + .into(), + ) + .build(); + + let db_name = "test_create_database"; + + assert_eq!(admin.database_exists(db_name).await.unwrap(), false); + + // create database + admin + .create_database(db_name, false, Some(&db_descriptor)) + .await + .expect("should create database"); + + // database should exist + assert_eq!(admin.database_exists(db_name).await.unwrap(), true); + + // get database + let db_info = admin + .get_database_info(db_name) + .await + .expect("should get database info"); + + assert_eq!(db_info.database_name(), db_name); + assert_eq!(db_info.database_descriptor(), &db_descriptor); + + // drop database + admin.drop_database(db_name, false, true).await; + + // database shouldn't exist now + assert_eq!(admin.database_exists(db_name).await.unwrap(), false); + + // Note: We don't stop the shared cluster here as it's used by other tests + } + + #[tokio::test] + async fn test_create_table() { + // todo + } +} diff --git a/fluss-rust/crates/fluss/tests/integration/client/mod.rs b/fluss-rust/crates/fluss/tests/integration/client/mod.rs 
deleted file mode 100644 index 567c358cea..0000000000 --- a/fluss-rust/crates/fluss/tests/integration/client/mod.rs +++ /dev/null @@ -1,21 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#[test] -fn test() { - println!("Running integration tests"); -} diff --git a/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs b/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs new file mode 100644 index 0000000000..83a47956ae --- /dev/null +++ b/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs @@ -0,0 +1,192 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use fluss::client::FlussConnection; +use fluss::config::Config; +use std::collections::HashMap; +use std::string::ToString; +use std::sync::Arc; +use std::time::Duration; +use testcontainers::core::ContainerPort; +use testcontainers::runners::AsyncRunner; +use testcontainers::{ContainerAsync, GenericImage, ImageExt}; + +const FLUSS_VERSION: &str = "0.7.0"; + +pub struct FlussTestingClusterBuilder { + number_of_tablet_servers: usize, + network: &'static str, + cluster_conf: HashMap, +} + +impl FlussTestingClusterBuilder { + pub fn new() -> Self { + // reduce testing resources + let mut cluster_conf = HashMap::new(); + cluster_conf.insert( + "netty.server.num-network-threads".to_string(), + "1".to_string(), + ); + cluster_conf.insert( + "netty.server.num-worker-threads".to_string(), + "3".to_string(), + ); + + FlussTestingClusterBuilder { + number_of_tablet_servers: 1, + cluster_conf, + network: "fluss-cluster-network", + } + } + + pub async fn build(&mut self) -> FlussTestingCluster { + let zookeeper = Arc::new( + GenericImage::new("zookeeper", "3.9.2") + .with_network(self.network) + .with_container_name("zookeeper") + .start() + .await + .unwrap(), + ); + + let coordinator_server = Arc::new(self.start_coordinator_server().await); + + let mut tablet_servers = HashMap::new(); + for server_id in 0..self.number_of_tablet_servers { + tablet_servers.insert( + server_id, + Arc::new(self.start_tablet_server(server_id).await), + ); + } + + FlussTestingCluster { + zookeeper, + coordinator_server, + tablet_servers, + bootstrap_servers: 
"127.0.0.1:9123".to_string(), + } + } + + async fn start_coordinator_server(&mut self) -> ContainerAsync { + let mut coordinator_confs = HashMap::new(); + coordinator_confs.insert("zookeeper.address", "zookeeper:2181"); + coordinator_confs.insert( + "bind.listeners", + "INTERNAL://coordinator-server:0, CLIENT://coordinator-server:9123", + ); + coordinator_confs.insert("advertised.listeners", "CLIENT://localhost:9123"); + coordinator_confs.insert("internal.listener.name", "INTERNAL"); + GenericImage::new("fluss/fluss", FLUSS_VERSION) + .with_container_name("coordinator-server") + .with_mapped_port(9123, ContainerPort::Tcp(9123)) + .with_network(self.network) + .with_cmd(vec!["coordinatorServer"]) + .with_env_var( + "FLUSS_PROPERTIES", + self.to_fluss_properties_with(coordinator_confs), + ) + .start() + .await + .unwrap() + } + + async fn start_tablet_server(&self, server_id: usize) -> ContainerAsync { + let mut tablet_server_confs = HashMap::new(); + let bind_listeners = format!( + "INTERNAL://tablet-server-{}:0, CLIENT://tablet-server-{}:9123", + server_id, server_id + ); + let expose_host_port = 9124 + server_id; + let advertised_listeners = format!("CLIENT://localhost:{}", expose_host_port); + let tablet_server_id = format!("{}", server_id); + tablet_server_confs.insert("zookeeper.address", "zookeeper:2181"); + tablet_server_confs.insert("bind.listeners", bind_listeners.as_str()); + tablet_server_confs.insert("advertised.listeners", advertised_listeners.as_str()); + tablet_server_confs.insert("internal.listener.name", "INTERNAL"); + tablet_server_confs.insert("tablet-server.id", tablet_server_id.as_str()); + + GenericImage::new("fluss/fluss", FLUSS_VERSION) + .with_cmd(vec!["tabletServer"]) + .with_mapped_port(expose_host_port as u16, ContainerPort::Tcp(9123)) + .with_network(self.network) + .with_container_name(format!("tablet-server-{}", server_id)) + .with_env_var( + "FLUSS_PROPERTIES", + self.to_fluss_properties_with(tablet_server_confs), + ) + .start() + 
.await + .unwrap() + } + + fn to_fluss_properties_with(&self, extra_properties: HashMap<&str, &str>) -> String { + let mut fluss_properties = Vec::new(); + for (k, v) in self.cluster_conf.iter() { + fluss_properties.push(format!("{}: {}", k, v)); + } + for (k, v) in extra_properties.iter() { + fluss_properties.push(format!("{}: {}", k, v)); + } + fluss_properties.join("\n") + } +} + +/// Provides an easy way to launch a Fluss cluster with coordinator and tablet servers. +#[derive(Clone)] +pub struct FlussTestingCluster { + zookeeper: Arc>, + coordinator_server: Arc>, + tablet_servers: HashMap>>, + bootstrap_servers: String, +} + +impl FlussTestingCluster { + pub async fn stop(&self) { + for tablet_server in self.tablet_servers.values() { + tablet_server.stop().await.unwrap() + } + self.coordinator_server.stop().await.unwrap(); + self.zookeeper.stop().await.unwrap(); + } + + pub async fn get_fluss_connection(&self) -> FlussConnection { + let mut config = Config::default(); + config.bootstrap_server = Some(self.bootstrap_servers.clone()); + + // Retry mechanism: retry for up to 1 minute + let max_retries = 60; // 60 retry attempts + let retry_interval = Duration::from_secs(1); // 1 second interval between retries + + for attempt in 1..=max_retries { + match FlussConnection::new(config.clone()).await { + Ok(connection) => { + return connection; + } + Err(e) => { + if attempt == max_retries { + panic!( + "Failed to connect to Fluss cluster after {} attempts: {}", + max_retries, e + ); + } + tokio::time::sleep(retry_interval).await; + } + } + } + unreachable!() + } +} diff --git a/fluss-rust/crates/fluss/tests/test_fluss.rs b/fluss-rust/crates/fluss/tests/test_fluss.rs index 7840638c74..28b9bef7d9 100644 --- a/fluss-rust/crates/fluss/tests/test_fluss.rs +++ b/fluss-rust/crates/fluss/tests/test_fluss.rs @@ -20,6 +20,6 @@ extern crate fluss; #[cfg(feature = "integration_tests")] mod integration { - - mod client; + mod admin; + mod fluss_cluster; } From 
9533ce2e7445e3e6ea5dd0de87f9e2b58c020dc3 Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Thu, 16 Oct 2025 14:17:30 +0800 Subject: [PATCH 011/287] [chore] Supports PyO3 in m1 arm 64 (#33) --- fluss-rust/.cargo/config.toml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 fluss-rust/.cargo/config.toml diff --git a/fluss-rust/.cargo/config.toml b/fluss-rust/.cargo/config.toml new file mode 100644 index 0000000000..57efc7ff75 --- /dev/null +++ b/fluss-rust/.cargo/config.toml @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[target.aarch64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] \ No newline at end of file From a2cc54dd43503588ec6071d1e05f90406854dc0e Mon Sep 17 00:00:00 2001 From: naivedogger <59598718+naivedogger@users.noreply.github.com> Date: Thu, 16 Oct 2025 19:32:50 +0800 Subject: [PATCH 012/287] [feat] Create Python bindings for table writing and reading (#9) --------- Co-authored-by: luoyuxia --- fluss-rust/Cargo.toml | 6 +- fluss-rust/bindings/python/Cargo.toml | 3 +- fluss-rust/bindings/python/fluss/__init__.py | 2 +- fluss-rust/bindings/python/pyproject.toml | 2 +- fluss-rust/bindings/python/src/admin.rs | 26 +- fluss-rust/bindings/python/src/config.rs | 33 +- fluss-rust/bindings/python/src/connection.rs | 45 +- fluss-rust/bindings/python/src/error.rs | 8 +- fluss-rust/bindings/python/src/lib.rs | 23 +- fluss-rust/bindings/python/src/metadata.rs | 119 +++-- fluss-rust/bindings/python/src/table.rs | 412 ++++++++++++++++++ fluss-rust/bindings/python/src/utils.rs | 105 +++-- fluss-rust/crates/fluss/Cargo.toml | 7 +- .../crates/fluss/src/client/table/mod.rs | 20 +- .../crates/fluss/src/client/table/scanner.rs | 103 ++++- .../crates/fluss/src/proto/fluss_api.proto | 20 + fluss-rust/crates/fluss/src/record/mod.rs | 5 + fluss-rust/crates/fluss/src/row/column.rs | 9 + fluss-rust/crates/fluss/src/rpc/api_key.rs | 3 + .../fluss/src/rpc/message/list_offsets.rs | 124 ++++++ .../crates/fluss/src/rpc/message/mod.rs | 2 + 21 files changed, 909 insertions(+), 168 deletions(-) create mode 100644 fluss-rust/bindings/python/src/table.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs diff --git a/fluss-rust/Cargo.toml b/fluss-rust/Cargo.toml index 059236fccf..54436ac17d 100644 --- a/fluss-rust/Cargo.toml +++ b/fluss-rust/Cargo.toml @@ -28,9 +28,11 @@ rust-version = "1.85" [workspace] resolver = "2" -members = ["crates/fluss", "crates/examples"] +members = ["crates/fluss", "crates/examples", 
"bindings/python"] [workspace.dependencies] fluss = { version = "0.1.0", path = "./crates/fluss" } tokio = { version = "1.44.2", features = ["full"] } -clap = { version = "4.5.37", features = ["derive"] } \ No newline at end of file +clap = { version = "4.5.37", features = ["derive"] } +arrow = "55.1.0" +chrono = { version = "0.4", features = ["clock", "std", "wasmbind"] } diff --git a/fluss-rust/bindings/python/Cargo.toml b/fluss-rust/bindings/python/Cargo.toml index aee1a21314..04826fb289 100644 --- a/fluss-rust/bindings/python/Cargo.toml +++ b/fluss-rust/bindings/python/Cargo.toml @@ -26,8 +26,6 @@ rust-version = "1.85" name = "fluss" crate-type = ["cdylib"] -[workspace] - [dependencies] pyo3 = { version = "0.24", features = ["extension-module"] } fluss = { path = "../../crates/fluss" } @@ -36,3 +34,4 @@ arrow = { workspace = true } arrow-pyarrow = "55.1.0" pyo3-async-runtimes = { version = "0.24.0", features = ["tokio-runtime"] } chrono = { workspace = true } +once_cell = "1.21.3" diff --git a/fluss-rust/bindings/python/fluss/__init__.py b/fluss-rust/bindings/python/fluss/__init__.py index cceee102ad..098014adc6 100644 --- a/fluss-rust/bindings/python/fluss/__init__.py +++ b/fluss-rust/bindings/python/fluss/__init__.py @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -from .fluss_python import * +from ._fluss import * __version__ = "0.1.0" diff --git a/fluss-rust/bindings/python/pyproject.toml b/fluss-rust/bindings/python/pyproject.toml index fe9d58878d..e28b3d2474 100644 --- a/fluss-rust/bindings/python/pyproject.toml +++ b/fluss-rust/bindings/python/pyproject.toml @@ -57,7 +57,7 @@ docs = [ ] [tool.maturin] -python-source = "python" +python-source = "." 
module-name = "fluss._fluss" features = ["pyo3/extension-module"] diff --git a/fluss-rust/bindings/python/src/admin.rs b/fluss-rust/bindings/python/src/admin.rs index 7ec6eee93b..73b2dd3af0 100644 --- a/fluss-rust/bindings/python/src/admin.rs +++ b/fluss-rust/bindings/python/src/admin.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. -use pyo3::prelude::*; -use pyo3_async_runtimes::tokio::future_into_py; use crate::*; +use pyo3_async_runtimes::tokio::future_into_py; use std::sync::Arc; /// Administrative client for managing Fluss tables @@ -38,16 +37,17 @@ impl FlussAdmin { ignore_if_exists: Option, ) -> PyResult> { let ignore = ignore_if_exists.unwrap_or(false); - + let core_table_path = table_path.to_core().clone(); let core_descriptor = table_descriptor.to_core().clone(); let admin = self.__admin.clone(); future_into_py(py, async move { - admin.create_table(&core_table_path, &core_descriptor, ignore) + admin + .create_table(&core_table_path, &core_descriptor, ignore) .await .map_err(|e| FlussError::new_err(e.to_string()))?; - + Python::with_gil(|py| Ok(py.None())) }) } @@ -60,10 +60,12 @@ impl FlussAdmin { ) -> PyResult> { let core_table_path = table_path.to_core().clone(); let admin = self.__admin.clone(); - + future_into_py(py, async move { - let core_table_info = admin.get_table(&core_table_path).await - .map_err(|e| FlussError::new_err(format!("Failed to get table: {}", e)))?; + let core_table_info = admin + .get_table(&core_table_path) + .await + .map_err(|e| FlussError::new_err(format!("Failed to get table: {e}")))?; Python::with_gil(|py| { let table_info = TableInfo::from_core(core_table_info); @@ -80,10 +82,12 @@ impl FlussAdmin { ) -> PyResult> { let core_table_path = table_path.to_core().clone(); let admin = self.__admin.clone(); - + future_into_py(py, async move { - let core_lake_snapshot = admin.get_latest_lake_snapshot(&core_table_path).await - .map_err(|e| FlussError::new_err(format!("Failed to 
get lake snapshot: {}", e)))?; + let core_lake_snapshot = admin + .get_latest_lake_snapshot(&core_table_path) + .await + .map_err(|e| FlussError::new_err(format!("Failed to get lake snapshot: {e}")))?; Python::with_gil(|py| { let lake_snapshot = LakeSnapshot::from_core(core_lake_snapshot); diff --git a/fluss-rust/bindings/python/src/config.rs b/fluss-rust/bindings/python/src/config.rs index 08b20b4d40..70bd9cd770 100644 --- a/fluss-rust/bindings/python/src/config.rs +++ b/fluss-rust/bindings/python/src/config.rs @@ -15,9 +15,8 @@ // specific language governing permissions and limitations // under the License. -use pyo3::prelude::*; -use pyo3::types::PyDict; use crate::*; +use pyo3::types::PyDict; /// Configuration for Fluss client #[pyclass] @@ -33,7 +32,7 @@ impl Config { #[pyo3(signature = (properties = None))] fn new(properties: Option<&Bound<'_, PyDict>>) -> PyResult { let mut config = fcore::config::Config::default(); - + if let Some(props) = properties { for item in props.iter() { let key: String = item.0.extract()?; @@ -42,67 +41,65 @@ impl Config { match key.as_str() { "bootstrap.servers" => { config.bootstrap_server = Some(value); - }, + } "request.max.size" => { if let Ok(size) = value.parse::() { config.request_max_size = size; } - }, + } "writer.acks" => { config.writer_acks = value; - }, + } "writer.retries" => { if let Ok(retries) = value.parse::() { config.writer_retries = retries; } - }, + } "writer.batch.size" => { if let Ok(size) = value.parse::() { config.writer_batch_size = size; } - }, + } _ => { - return Err(FlussError::new_err(format!("Unknown property: {}", key))); + return Err(FlussError::new_err(format!("Unknown property: {key}"))); } } } } - Ok(Self { - inner: config, - }) + Ok(Self { inner: config }) } - + /// Get the bootstrap server #[getter] fn bootstrap_server(&self) -> Option { self.inner.bootstrap_server.clone() } - + /// Set the bootstrap server #[setter] fn set_bootstrap_server(&mut self, server: String) { 
self.inner.bootstrap_server = Some(server); } - + /// Get the request max size #[getter] fn request_max_size(&self) -> i32 { self.inner.request_max_size } - + /// Set the request max size #[setter] fn set_request_max_size(&mut self, size: i32) { self.inner.request_max_size = size; } - + /// Get the writer batch size #[getter] fn writer_batch_size(&self) -> i32 { self.inner.writer_batch_size } - + /// Set the writer batch size #[setter] fn set_writer_batch_size(&mut self, size: i32) { diff --git a/fluss-rust/bindings/python/src/connection.rs b/fluss-rust/bindings/python/src/connection.rs index ba1fa50554..aeb8410ddf 100644 --- a/fluss-rust/bindings/python/src/connection.rs +++ b/fluss-rust/bindings/python/src/connection.rs @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. -use pyo3::prelude::*; use crate::*; -use std::sync::Arc; use pyo3_async_runtimes::tokio::future_into_py; +use std::sync::Arc; /// Connection to a Fluss cluster #[pyclass] @@ -37,55 +36,55 @@ impl FlussConnection { let connection = fcore::client::FlussConnection::new(rust_config) .await .map_err(|e| FlussError::new_err(e.to_string()))?; - + let py_connection = FlussConnection { inner: Arc::new(connection), }; - Python::with_gil(|py| { - Py::new(py, py_connection) - }) + Python::with_gil(|py| Py::new(py, py_connection)) }) } - + /// Get admin interface fn get_admin<'py>(&self, py: Python<'py>) -> PyResult> { let client = self.inner.clone(); future_into_py(py, async move { - let admin = client.get_admin() + let admin = client + .get_admin() .await .map_err(|e| FlussError::new_err(e.to_string()))?; let py_admin = FlussAdmin::from_core(admin); - Python::with_gil(|py| { - Py::new(py, py_admin) - }) + Python::with_gil(|py| Py::new(py, py_admin)) }) } /// Get a table - fn get_table<'py>(&self, py: Python<'py>, table_path: &TablePath) -> PyResult> { + fn get_table<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + ) -> PyResult> { let client = 
self.inner.clone(); let core_path = table_path.to_core().clone(); future_into_py(py, async move { - let core_table = client.get_table(&core_path) + let core_table = client + .get_table(&core_path) .await .map_err(|e| FlussError::new_err(e.to_string()))?; - + let py_table = FlussTable::new_table( - client, - core_table.metadata, - core_table.table_info, - core_table.table_path, - core_table.has_primary_key, + client.clone(), + core_table.metadata().clone(), + core_table.table_info().clone(), + core_table.table_path().clone(), + core_table.has_primary_key(), ); - Python::with_gil(|py| { - Py::new(py, py_table) - }) + Python::with_gil(|py| Py::new(py, py_table)) }) } @@ -98,7 +97,7 @@ impl FlussConnection { fn __enter__(slf: PyRef) -> PyRef { slf } - + // Exit the runtime context (for 'with' statement) #[pyo3(signature = (_exc_type=None, _exc_value=None, _traceback=None))] fn __exit__( diff --git a/fluss-rust/bindings/python/src/error.rs b/fluss-rust/bindings/python/src/error.rs index 2db2991cfb..35d9d9149f 100644 --- a/fluss-rust/bindings/python/src/error.rs +++ b/fluss-rust/bindings/python/src/error.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use pyo3::exceptions::PyException; use pyo3::prelude::*; /// Fluss errors @@ -27,6 +28,11 @@ pub struct FlussError { #[pymethods] impl FlussError { + #[new] + fn new(message: String) -> Self { + Self { message } + } + fn __str__(&self) -> String { format!("FlussError: {}", self.message) } @@ -36,4 +42,4 @@ impl FlussError { pub fn new_err(message: impl ToString) -> PyErr { PyErr::new::(message.to_string()) } -} \ No newline at end of file +} diff --git a/fluss-rust/bindings/python/src/lib.rs b/fluss-rust/bindings/python/src/lib.rs index 0d8b7a5a80..63e84b1f86 100644 --- a/fluss-rust/bindings/python/src/lib.rs +++ b/fluss-rust/bindings/python/src/lib.rs @@ -16,24 +16,24 @@ // under the License. 
pub use ::fluss as fcore; -use pyo3::prelude::*; use once_cell::sync::Lazy; +use pyo3::prelude::*; use tokio::runtime::Runtime; +mod admin; mod config; mod connection; -mod table; -mod admin; -mod types; mod error; +mod metadata; +mod table; mod utils; +pub use admin::*; pub use config::*; pub use connection::*; -pub use table::*; -pub use admin::*; -pub use types::*; pub use error::*; +pub use metadata::*; +pub use table::*; pub use utils::*; static TOKIO_RUNTIME: Lazy = Lazy::new(|| { @@ -44,7 +44,7 @@ static TOKIO_RUNTIME: Lazy = Lazy::new(|| { }); #[pymodule] -fn fluss_python(m: &Bound<'_, PyModule>) -> PyResult<()> { +fn _fluss(m: &Bound<'_, PyModule>) -> PyResult<()> { // Register all classes m.add_class::()?; m.add_class::()?; @@ -58,10 +58,9 @@ fn fluss_python(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - + // Register exception types - // TODO: maybe implement a separate module for exceptions - m.add("FlussError", m.py().get_type::())?; - + m.add_class::()?; + Ok(()) } diff --git a/fluss-rust/bindings/python/src/metadata.rs b/fluss-rust/bindings/python/src/metadata.rs index 238dde2ec0..66748ab316 100644 --- a/fluss-rust/bindings/python/src/metadata.rs +++ b/fluss-rust/bindings/python/src/metadata.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-use pyo3::prelude::*; use crate::*; use pyo3::types::PyDict; use std::collections::HashMap; @@ -38,13 +37,13 @@ impl TablePath { table_name, } } - + /// Get the database name #[getter] pub fn database_name(&self) -> String { self.database_name.clone() } - + /// Get the table name #[getter] pub fn table_name(&self) -> String { @@ -59,7 +58,7 @@ impl TablePath { pub fn __str__(&self) -> String { self.table_path_str() } - + fn __repr__(&self) -> String { format!("TablePath('{}', '{}')", self.database_name, self.table_name) } @@ -68,7 +67,7 @@ impl TablePath { pub fn __hash__(&self) -> u64 { use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; - + let mut hasher = DefaultHasher::new(); self.database_name.hash(&mut hasher); self.table_name.hash(&mut hasher); @@ -77,8 +76,7 @@ impl TablePath { /// Equality implementation for Python pub fn __eq__(&self, other: &TablePath) -> bool { - self.database_name == other.database_name - && self.table_name == other.table_name + self.database_name == other.database_name && self.table_name == other.table_name } } @@ -112,27 +110,28 @@ impl Schema { primary_keys: Option>, ) -> PyResult { let arrow_schema = crate::utils::Utils::pyarrow_to_arrow_schema(&schema)?; - + let mut builder = fcore::metadata::Schema::builder(); - + for field in arrow_schema.fields() { let fluss_data_type = crate::utils::Utils::arrow_type_to_fluss_type(field.data_type())?; builder = builder.column(field.name(), fluss_data_type); - + if let Some(comment) = field.metadata().get("comment") { builder = builder.with_comment(comment); } } - + if let Some(pk_columns) = primary_keys { if !pk_columns.is_empty() { builder = builder.primary_key(pk_columns); } } - - let fluss_schema = builder.build() - .map_err(|e| FlussError::new_err(format!("Failed to build schema: {}", e)))?; - + + let fluss_schema = builder + .build() + .map_err(|e| FlussError::new_err(format!("Failed to build schema: {e}")))?; + Ok(Self { __schema: fluss_schema, }) @@ -140,20 
+139,33 @@ impl Schema { /// Get column names fn get_column_names(&self) -> Vec { - self.__schema.columns().iter().map(|col| col.name().to_string()).collect() + self.__schema + .columns() + .iter() + .map(|col| col.name().to_string()) + .collect() } /// Get column types fn get_column_types(&self) -> Vec { - self.__schema.columns().iter() + self.__schema + .columns() + .iter() .map(|col| Utils::datatype_to_string(col.data_type())) .collect() } /// Get columns as (name, type) pairs fn get_columns(&self) -> Vec<(String, String)> { - self.__schema.columns().iter() - .map(|col| (col.name().to_string(), Utils::datatype_to_string(col.data_type()))) + self.__schema + .columns() + .iter() + .map(|col| { + ( + col.name().to_string(), + Utils::datatype_to_string(col.data_type()), + ) + }) .collect() } @@ -190,7 +202,6 @@ impl TableDistribution { } } - /// Table descriptor containing schema and metadata #[pyclass] #[derive(Clone)] @@ -204,7 +215,7 @@ impl TableDescriptor { #[new] #[pyo3(signature = (schema, **kwargs))] pub fn new( - schema: &Schema, // fluss schema + schema: &Schema, // fluss schema kwargs: Option<&Bound<'_, PyDict>>, ) -> PyResult { let mut partition_keys = Vec::new(); @@ -237,18 +248,22 @@ impl TableDescriptor { } if let Ok(Some(lformat)) = kwargs.get_item("log_format") { let format_str: String = lformat.extract()?; - log_format = Some(fcore::metadata::LogFormat::parse(&format_str) - .map_err(|e| FlussError::new_err(e.to_string()))?); + log_format = Some( + fcore::metadata::LogFormat::parse(&format_str) + .map_err(|e| FlussError::new_err(e.to_string()))?, + ); } if let Ok(Some(kformat)) = kwargs.get_item("kv_format") { let format_str: String = kformat.extract()?; - kv_format = Some(fcore::metadata::KvFormat::parse(&format_str) - .map_err(|e| FlussError::new_err(e.to_string()))?); + kv_format = Some( + fcore::metadata::KvFormat::parse(&format_str) + .map_err(|e| FlussError::new_err(e.to_string()))?, + ); } } let fluss_schema = schema.to_core().clone(); - + 
let mut builder = fcore::metadata::TableDescriptor::builder() .schema(fluss_schema) .properties(properties) @@ -266,8 +281,9 @@ impl TableDescriptor { builder = builder.kv_format(kv_format); } - let core_descriptor = builder.build() - .map_err(|e| FlussError::new_err(format!("Failed to build TableDescriptor: {}", e)))?; + let core_descriptor = builder + .build() + .map_err(|e| FlussError::new_err(format!("Failed to build TableDescriptor: {e}")))?; Ok(Self { __tbl_desc: core_descriptor, @@ -303,13 +319,13 @@ impl TableInfo { pub fn table_id(&self) -> i64 { self.__table_info.get_table_id() } - + /// Get the schema ID #[getter] pub fn schema_id(&self) -> i32 { self.__table_info.get_schema_id() } - + /// Get the table path #[getter] pub fn table_path(&self) -> TablePath { @@ -321,13 +337,13 @@ impl TableInfo { pub fn created_time(&self) -> i64 { self.__table_info.get_created_time() } - + /// Get the modified time #[getter] pub fn modified_time(&self) -> i64 { self.__table_info.get_modified_time() } - + /// Get the primary keys pub fn get_primary_keys(&self) -> Vec { self.__table_info.get_primary_keys().clone() @@ -384,7 +400,10 @@ impl TableInfo { /// Get column names pub fn get_column_names(&self) -> Vec { - self.__table_info.get_schema().columns().iter() + self.__table_info + .get_schema() + .columns() + .iter() .map(|col| col.name().to_string()) .collect() } @@ -398,9 +417,7 @@ impl TableInfo { impl TableInfo { /// Create from core TableInfo (internal use) pub fn from_core(info: fcore::metadata::TableInfo) -> Self { - Self { - __table_info: info, - } + Self { __table_info: info } } } @@ -414,7 +431,7 @@ pub struct LakeSnapshot { /// Represents a table bucket with table ID, partition ID, and bucket ID #[pyclass] -#[derive(Clone)] +#[derive(Eq, Hash, PartialEq, Clone)] pub struct TableBucket { table_id: i64, partition_id: Option, @@ -464,11 +481,15 @@ impl TableBucket { /// String representation pub fn __str__(&self) -> String { if let Some(partition_id) = 
self.partition_id { - format!("TableBucket(table_id={}, partition_id={}, bucket={})", - self.table_id, partition_id, self.bucket) + format!( + "TableBucket(table_id={}, partition_id={}, bucket={})", + self.table_id, partition_id, self.bucket + ) } else { - format!("TableBucket(table_id={}, bucket={})", - self.table_id, self.bucket) + format!( + "TableBucket(table_id={}, bucket={})", + self.table_id, self.bucket + ) } } @@ -481,7 +502,7 @@ impl TableBucket { pub fn __hash__(&self) -> u64 { use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; - + let mut hasher = DefaultHasher::new(); self.table_id.hash(&mut hasher); self.partition_id.hash(&mut hasher); @@ -491,8 +512,8 @@ impl TableBucket { /// Equality implementation for Python pub fn __eq__(&self, other: &TableBucket) -> bool { - self.table_id == other.table_id - && self.partition_id == other.partition_id + self.table_id == other.table_id + && self.partition_id == other.partition_id && self.bucket == other.bucket } } @@ -509,7 +530,7 @@ impl TableBucket { /// Convert to core TableBucket (internal use) pub fn to_core(&self) -> fcore::metadata::TableBucket { - fcore::metadata::TableBucket::new(self.table_id, self.partition_id, self.bucket) + fcore::metadata::TableBucket::new(self.table_id, self.bucket) } } @@ -559,8 +580,11 @@ impl LakeSnapshot { /// String representation pub fn __str__(&self) -> String { - format!("LakeSnapshot(snapshot_id={}, buckets_count={})", - self.snapshot_id, self.table_buckets_offset.len()) + format!( + "LakeSnapshot(snapshot_id={}, buckets_count={})", + self.snapshot_id, + self.table_buckets_offset.len() + ) } /// String representation @@ -578,4 +602,3 @@ impl LakeSnapshot { } } } - diff --git a/fluss-rust/bindings/python/src/table.rs b/fluss-rust/bindings/python/src/table.rs new file mode 100644 index 0000000000..98943b939a --- /dev/null +++ b/fluss-rust/bindings/python/src/table.rs @@ -0,0 +1,412 @@ +// Licensed to the Apache Software Foundation (ASF) under one 
+// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::TOKIO_RUNTIME; +use crate::*; +use pyo3_async_runtimes::tokio::future_into_py; +use std::collections::HashSet; +use std::sync::Arc; + +const EARLIEST_OFFSET: i64 = -2; + +/// Represents a Fluss table for data operations +#[pyclass] +pub struct FlussTable { + connection: Arc, + metadata: Arc, + table_info: fcore::metadata::TableInfo, + table_path: fcore::metadata::TablePath, + has_primary_key: bool, +} + +#[pymethods] +impl FlussTable { + /// Create a new append writer for the table + fn new_append_writer<'py>(&self, py: Python<'py>) -> PyResult> { + let conn = self.connection.clone(); + let metadata = self.metadata.clone(); + let table_info = self.table_info.clone(); + + future_into_py(py, async move { + let fluss_table = fcore::client::FlussTable::new(&conn, metadata, table_info); + + let table_append = fluss_table + .new_append() + .map_err(|e| FlussError::new_err(e.to_string()))?; + + let rust_writer = table_append.create_writer(); + + let py_writer = AppendWriter::from_core(rust_writer); + + Python::with_gil(|py| Py::new(py, py_writer)) + }) + } + + /// Create a new log scanner for the table + fn new_log_scanner<'py>(&self, py: Python<'py>) -> PyResult> { + let conn = self.connection.clone(); + let metadata = 
self.metadata.clone(); + let table_info = self.table_info.clone(); + + future_into_py(py, async move { + let fluss_table = + fcore::client::FlussTable::new(&conn, metadata.clone(), table_info.clone()); + + let table_scan = fluss_table.new_scan(); + + let rust_scanner = table_scan.create_log_scanner(); + + let py_scanner = LogScanner::from_core(rust_scanner, table_info.clone()); + + Python::with_gil(|py| Py::new(py, py_scanner)) + }) + } + + /// Get table information + pub fn get_table_info(&self) -> TableInfo { + TableInfo::from_core(self.table_info.clone()) + } + + /// Get table path + pub fn get_table_path(&self) -> TablePath { + TablePath::from_core(self.table_path.clone()) + } + + /// Check if table has primary key + pub fn has_primary_key(&self) -> bool { + self.has_primary_key + } + + fn __repr__(&self) -> String { + format!( + "FlussTable(path={}.{})", + self.table_path.database(), + self.table_path.table() + ) + } +} + +impl FlussTable { + /// Create a FlussTable + pub fn new_table( + connection: Arc, + metadata: Arc, + table_info: fcore::metadata::TableInfo, + table_path: fcore::metadata::TablePath, + has_primary_key: bool, + ) -> Self { + Self { + connection, + metadata, + table_info, + table_path, + has_primary_key, + } + } +} + +/// Writer for appending data to a Fluss table +#[pyclass] +pub struct AppendWriter { + inner: fcore::client::AppendWriter, +} + +#[pymethods] +impl AppendWriter { + /// Write Arrow table data + pub fn write_arrow(&mut self, py: Python, table: PyObject) -> PyResult<()> { + // Convert Arrow Table to batches and write each batch + let batches = table.call_method0(py, "to_batches")?; + let batch_list: Vec = batches.extract(py)?; + + for batch in batch_list { + self.write_arrow_batch(py, batch)?; + } + Ok(()) + } + + /// Write Arrow batch data + pub fn write_arrow_batch(&mut self, py: Python, batch: PyObject) -> PyResult<()> { + // Extract number of rows and columns from the Arrow batch + let num_rows: usize = batch.getattr(py, 
"num_rows")?.extract(py)?; + let num_columns: usize = batch.getattr(py, "num_columns")?.extract(py)?; + + // Process each row in the batch + for row_idx in 0..num_rows { + let mut generic_row = fcore::row::GenericRow::new(); + + // Extract values for each column in this row + for col_idx in 0..num_columns { + let column = batch.call_method1(py, "column", (col_idx,))?; + let value = column.call_method1(py, "__getitem__", (row_idx,))?; + + // Convert the Python value to a Datum and add to the row + let datum = self.convert_python_value_to_datum(py, value)?; + generic_row.set_field(col_idx, datum); + } + + // Append this row using the async append method + TOKIO_RUNTIME.block_on(async { + self.inner + .append(generic_row) + .await + .map_err(|e| FlussError::new_err(e.to_string())) + })?; + } + + Ok(()) + } + + /// Write Pandas DataFrame data + pub fn write_pandas(&mut self, py: Python, df: PyObject) -> PyResult<()> { + // Import pyarrow module + let pyarrow = py.import("pyarrow")?; + + // Get the Table class from pyarrow module + let table_class = pyarrow.getattr("Table")?; + + // Call Table.from_pandas(df) - from_pandas is a class method + let pa_table = table_class.call_method1("from_pandas", (df,))?; + + // Then call write_arrow with the converted table + self.write_arrow(py, pa_table.into()) + } + + /// Flush any pending data + pub fn flush(&mut self) -> PyResult<()> { + TOKIO_RUNTIME.block_on(async { + self.inner + .flush() + .await + .map_err(|e| FlussError::new_err(e.to_string())) + }) + } + + fn __repr__(&self) -> String { + "AppendWriter()".to_string() + } +} + +impl AppendWriter { + /// Create a AppendWriter from a core append writer + pub fn from_core(append: fcore::client::AppendWriter) -> Self { + Self { inner: append } + } + + fn convert_python_value_to_datum( + &self, + py: Python, + value: PyObject, + ) -> PyResult> { + use fcore::row::{Blob, Datum, F32, F64}; + + // Check for None (null) + if value.is_none(py) { + return Ok(Datum::Null); + } + + // 
Try to extract different types + if let Ok(type_name) = value.bind(py).get_type().name() { + if type_name == "StringScalar" { + if let Ok(py_value) = value.call_method0(py, "as_py") { + if let Ok(str_val) = py_value.extract::(py) { + let leaked_str: &'static str = Box::leak(str_val.into_boxed_str()); + return Ok(Datum::String(leaked_str)); + } + } + } + } + + if let Ok(bool_val) = value.extract::(py) { + return Ok(Datum::Bool(bool_val)); + } + + if let Ok(int_val) = value.extract::(py) { + return Ok(Datum::Int32(int_val)); + } + + if let Ok(int_val) = value.extract::(py) { + return Ok(Datum::Int64(int_val)); + } + + if let Ok(float_val) = value.extract::(py) { + return Ok(Datum::Float32(F32::from(float_val))); + } + + if let Ok(float_val) = value.extract::(py) { + return Ok(Datum::Float64(F64::from(float_val))); + } + + if let Ok(str_val) = value.extract::(py) { + // Convert String to &'static str by leaking memory + // This is a simplified approach - in production, you might want better lifetime management + let leaked_str: &'static str = Box::leak(str_val.into_boxed_str()); + return Ok(Datum::String(leaked_str)); + } + + if let Ok(bytes_val) = value.extract::>(py) { + let blob = Blob::from(bytes_val); + return Ok(Datum::Blob(blob)); + } + + // If we can't convert, return an error + let type_name = value.bind(py).get_type().name()?; + Err(FlussError::new_err(format!( + "Cannot convert Python value to Datum: {type_name:?}" + ))) + } +} + +/// Scanner for reading log data from a Fluss table +#[pyclass] +pub struct LogScanner { + inner: fcore::client::LogScanner, + table_info: fcore::metadata::TableInfo, + #[allow(dead_code)] + start_timestamp: Option, + #[allow(dead_code)] + end_timestamp: Option, +} + +#[pymethods] +impl LogScanner { + /// Subscribe to log data with timestamp range + fn subscribe( + &mut self, + _start_timestamp: Option, + _end_timestamp: Option, + ) -> PyResult<()> { + if _start_timestamp.is_some() { + return Err(FlussError::new_err( + "Specifying 
start_timestamp is not yet supported. Please use None.".to_string(), + )); + } + if _end_timestamp.is_some() { + return Err(FlussError::new_err( + "Specifying end_timestamp is not yet supported. Please use None.".to_string(), + )); + } + + let num_buckets = self.table_info.get_num_buckets(); + for bucket_id in 0..num_buckets { + let start_offset = EARLIEST_OFFSET; + + TOKIO_RUNTIME.block_on(async { + self.inner + .subscribe(bucket_id, start_offset) + .await + .map_err(|e| FlussError::new_err(e.to_string())) + })?; + } + + Ok(()) + } + + /// Convert all data to Arrow Table + fn to_arrow(&self, py: Python) -> PyResult { + use std::collections::HashMap; + use std::time::Duration; + + let mut all_batches = Vec::new(); + + let num_buckets = self.table_info.get_num_buckets(); + let bucket_ids: Vec = (0..num_buckets).collect(); + + // todo: after supporting list_offsets with timestamp, we can use start_timestamp and end_timestamp here + let target_offsets: HashMap = TOKIO_RUNTIME + .block_on(async { self.inner.list_offsets_latest(bucket_ids).await }) + .map_err(|e| FlussError::new_err(e.to_string()))?; + + let mut current_offsets: HashMap = HashMap::new(); + let mut completed_buckets: HashSet = HashSet::new(); + + if !target_offsets.is_empty() { + loop { + let batch_result = TOKIO_RUNTIME + .block_on(async { self.inner.poll(Duration::from_millis(500)).await }); + + match batch_result { + Ok(scan_records) => { + let mut filtered_records: HashMap< + fcore::metadata::TableBucket, + Vec, + > = HashMap::new(); + for (bucket, records) in scan_records.records_by_buckets() { + let bucket_id = bucket.bucket_id(); + if completed_buckets.contains(&bucket_id) { + continue; + } + if let Some(last_record) = records.last() { + let offset = last_record.offset(); + current_offsets.insert(bucket_id, offset); + filtered_records.insert(bucket.clone(), records.clone()); + if offset >= target_offsets[&bucket_id] - 1 { + completed_buckets.insert(bucket_id); + } + } + } + + if 
!filtered_records.is_empty() { + let filtered_scan_records = + fcore::record::ScanRecords::new(filtered_records); + let arrow_batch = + Utils::convert_scan_records_to_arrow(filtered_scan_records); + all_batches.extend(arrow_batch); + } + + // completed bucket is equal to all target buckets, + // we can break scan records + if completed_buckets.len() == target_offsets.len() { + break; + } + } + Err(e) => return Err(FlussError::new_err(e.to_string())), + } + } + } + + Utils::combine_batches_to_table(py, all_batches) + } + + /// Convert all data to Pandas DataFrame + fn to_pandas(&self, py: Python) -> PyResult { + let arrow_table = self.to_arrow(py)?; + + // Convert Arrow Table to Pandas DataFrame using pyarrow + let df = arrow_table.call_method0(py, "to_pandas")?; + Ok(df) + } + + fn __repr__(&self) -> String { + format!("LogScanner(table={})", self.table_info.table_path) + } +} + +impl LogScanner { + /// Create LogScanner from core LogScanner + pub fn from_core( + inner: fcore::client::LogScanner, + table_info: fcore::metadata::TableInfo, + ) -> Self { + Self { + inner, + table_info, + start_timestamp: None, + end_timestamp: None, + } + } +} diff --git a/fluss-rust/bindings/python/src/utils.rs b/fluss-rust/bindings/python/src/utils.rs index c40104bfc9..9642e9d95b 100644 --- a/fluss-rust/bindings/python/src/utils.rs +++ b/fluss-rust/bindings/python/src/utils.rs @@ -15,11 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use pyo3::prelude::*; +use crate::*; use arrow::datatypes::{Schema as ArrowSchema, SchemaRef}; -use std::sync::Arc; use arrow_pyarrow::ToPyArrow; -use crate::*; +use std::sync::Arc; /// Utilities for schema conversion between PyArrow, Arrow, and Fluss pub struct Utils; @@ -29,15 +28,19 @@ impl Utils { pub fn pyarrow_to_arrow_schema(py_schema: &PyObject) -> PyResult { Python::with_gil(|py| { let schema_bound = py_schema.bind(py); - - let schema: ArrowSchema = arrow_pyarrow::FromPyArrow::from_pyarrow_bound(&schema_bound) - .map_err(|e| FlussError::new_err(format!("Failed to convert PyArrow schema: {}", e)))?; + + let schema: ArrowSchema = arrow_pyarrow::FromPyArrow::from_pyarrow_bound(schema_bound) + .map_err(|e| { + FlussError::new_err(format!("Failed to convert PyArrow schema: {e}")) + })?; Ok(Arc::new(schema)) }) } /// Convert Arrow DataType to Fluss DataType - pub fn arrow_type_to_fluss_type(arrow_type: &arrow::datatypes::DataType) -> PyResult { + pub fn arrow_type_to_fluss_type( + arrow_type: &arrow::datatypes::DataType, + ) -> PyResult { use arrow::datatypes::DataType as ArrowDataType; use fcore::metadata::DataTypes; @@ -59,10 +62,12 @@ impl Utils { ArrowDataType::Date64 => DataTypes::date(), ArrowDataType::Time32(_) | ArrowDataType::Time64(_) => DataTypes::time(), ArrowDataType::Timestamp(_, _) => DataTypes::timestamp(), - ArrowDataType::Decimal128(precision, scale) => DataTypes::decimal(*precision as u32, *scale as u32), + ArrowDataType::Decimal128(precision, scale) => { + DataTypes::decimal(*precision as u32, *scale as u32) + } _ => { return Err(FlussError::new_err(format!( - "Unsupported Arrow data type: {:?}", arrow_type + "Unsupported Arrow data type: {arrow_type:?}" ))); } }; @@ -89,47 +94,62 @@ impl Utils { } else { format!("time({})", t.precision()) } - }, + } fcore::metadata::DataType::Timestamp(t) => { if t.precision() == 6 { "timestamp".to_string() } else { format!("timestamp({})", t.precision()) } - }, + } 
fcore::metadata::DataType::TimestampLTz(t) => { if t.precision() == 6 { "timestamp_ltz".to_string() } else { format!("timestamp_ltz({})", t.precision()) } - }, + } fcore::metadata::DataType::Char(c) => format!("char({})", c.length()), - fcore::metadata::DataType::Decimal(d) => format!("decimal({},{})", d.precision(), d.scale()), + fcore::metadata::DataType::Decimal(d) => { + format!("decimal({},{})", d.precision(), d.scale()) + } fcore::metadata::DataType::Binary(b) => format!("binary({})", b.length()), - fcore::metadata::DataType::Array(arr) => format!("array<{}>", Utils::datatype_to_string(arr.get_element_type())), - fcore::metadata::DataType::Map(map) => format!("map<{},{}>", - Utils::datatype_to_string(map.key_type()), - Utils::datatype_to_string(map.value_type())), + fcore::metadata::DataType::Array(arr) => format!( + "array<{}>", + Utils::datatype_to_string(arr.get_element_type()) + ), + fcore::metadata::DataType::Map(map) => format!( + "map<{},{}>", + Utils::datatype_to_string(map.key_type()), + Utils::datatype_to_string(map.value_type()) + ), fcore::metadata::DataType::Row(row) => { - let fields: Vec = row.fields().iter() - .map(|field| format!("{}: {}", field.name(), Utils::datatype_to_string(field.data_type()))) + let fields: Vec = row + .fields() + .iter() + .map(|field| { + format!( + "{}: {}", + field.name(), + Utils::datatype_to_string(field.data_type()) + ) + }) .collect(); format!("row<{}>", fields.join(", ")) - }, + } } } /// Parse log format string to LogFormat enum pub fn parse_log_format(format_str: &str) -> PyResult { fcore::metadata::LogFormat::parse(format_str) - .map_err(|e| FlussError::new_err(format!("Invalid log format '{}': {}", format_str, e))) + .map_err(|e| FlussError::new_err(format!("Invalid log format '{format_str}': {e}"))) } /// Parse kv format string to KvFormat enum pub fn parse_kv_format(format_str: &str) -> PyResult { fcore::metadata::KvFormat::parse(format_str) - .map_err(|e| FlussError::new_err(format!("Invalid kv format 
'{}': {}", format_str, e))) + .map_err(|e| FlussError::new_err(format!("Invalid kv format '{format_str}': {e}"))) } /// Convert ScanRecords to Arrow RecordBatch @@ -137,42 +157,41 @@ impl Utils { _scan_records: fcore::record::ScanRecords, ) -> Vec> { let mut result = Vec::new(); - for(_, records) in _scan_records.into_records() { - for record in records { - let columnar_row = record.row(); - let row_id = columnar_row.get_row_id(); - if row_id == 0 { - let record_batch = columnar_row.get_record_batch(); - result.push(record_batch.clone()); - } + for record in _scan_records { + let columnar_row = record.row(); + let row_id = columnar_row.get_row_id(); + if row_id == 0 { + let record_batch = columnar_row.get_record_batch(); + result.push(Arc::new(record_batch.clone())); } } result } - + /// Combine multiple Arrow batches into a single Table - pub fn combine_batches_to_table(py: Python, batches: Vec>) -> PyResult { - if batches.is_empty() { - return Err(FlussError::new_err("No batches to combine")); - } - + pub fn combine_batches_to_table( + py: Python, + batches: Vec>, + ) -> PyResult { // Convert Rust Arrow RecordBatch to PyObject - let py_batches: Result, _> = batches.iter() + let py_batches: Result, _> = batches + .iter() .map(|batch| { - batch.as_ref().to_pyarrow(py) - .map_err(|e| FlussError::new_err(format!("Failed to convert RecordBatch to PyObject: {}", e))) + batch.as_ref().to_pyarrow(py).map_err(|e| { + FlussError::new_err(format!("Failed to convert RecordBatch to PyObject: {e}")) + }) }) .collect(); - + let py_batches = py_batches?; - + let pyarrow = py.import("pyarrow")?; - + // Use pyarrow.Table.from_batches to combine batches let table = pyarrow .getattr("Table")? 
.call_method1("from_batches", (py_batches,))?; - + Ok(table.into()) } } diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index a728bd74f8..ab1efc26d2 100644 --- a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -23,7 +23,7 @@ name = "fluss" build = "src/build.rs" [dependencies] -arrow = "55.1.0" +arrow = { workspace = true } arrow-schema = "55.1.0" byteorder = "1.5" futures = "0.3" @@ -44,7 +44,8 @@ rust_decimal = "1" ordered-float = { version = "4", features = ["serde"] } parse-display = "0.10" ref-cast = "1.0" -chrono = { version = "0.4", features = ["clock", "std", "wasmbind"] } +chrono = { workspace = true } +oneshot = "0.1.11" [dev-dependencies] testcontainers = "0.25.0" @@ -56,4 +57,4 @@ integration_tests = [] [build-dependencies] -prost-build = { version = "0.13.5" } \ No newline at end of file +prost-build = { version = "0.13.5" } diff --git a/fluss-rust/crates/fluss/src/client/table/mod.rs b/fluss-rust/crates/fluss/src/client/table/mod.rs index 4d6f8f045b..07e64948fa 100644 --- a/fluss-rust/crates/fluss/src/client/table/mod.rs +++ b/fluss-rust/crates/fluss/src/client/table/mod.rs @@ -27,8 +27,8 @@ mod append; mod scanner; mod writer; -pub use append::TableAppend; -pub use scanner::TableScan; +pub use append::{AppendWriter, TableAppend}; +pub use scanner::{LogScanner, TableScan}; #[allow(dead_code)] pub struct FlussTable<'a> { @@ -65,6 +65,22 @@ impl<'a> FlussTable<'a> { pub fn new_scan(&self) -> TableScan<'_> { TableScan::new(self.conn, self.table_info.clone(), self.metadata.clone()) } + + pub fn metadata(&self) -> &Arc { + &self.metadata + } + + pub fn table_info(&self) -> &TableInfo { + &self.table_info + } + + pub fn table_path(&self) -> &TablePath { + &self.table_path + } + + pub fn has_primary_key(&self) -> bool { + self.has_primary_key + } } impl<'a> Drop for FlussTable<'a> { diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs 
b/fluss-rust/crates/fluss/src/client/table/scanner.rs index 41fb17e8c8..cbe724896e 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -22,12 +22,14 @@ use crate::metadata::{TableBucket, TableInfo, TablePath}; use crate::proto::{FetchLogRequest, PbFetchLogReqForBucket, PbFetchLogReqForTable}; use crate::record::{LogRecordsBatchs, ReadContext, ScanRecord, ScanRecords, to_arrow_schema}; use crate::rpc::RpcClient; +use crate::rpc::message::{ListOffsetsRequest, OffsetSpec}; use crate::util::FairBucketStatusMap; use parking_lot::RwLock; use std::collections::HashMap; use std::slice::from_ref; use std::sync::Arc; use std::time::Duration; +use tokio::task::JoinHandle; const LOG_FETCH_MAX_BYTES: i32 = 16 * 1024 * 1024; #[allow(dead_code)] @@ -65,6 +67,7 @@ pub struct LogScanner { metadata: Arc, log_scanner_status: Arc, log_fetcher: LogFetcher, + conns: Arc, } impl LogScanner { @@ -81,10 +84,11 @@ impl LogScanner { log_scanner_status: log_scanner_status.clone(), log_fetcher: LogFetcher::new( table_info.clone(), - connections, + connections.clone(), metadata.clone(), log_scanner_status.clone(), ), + conns: connections.clone(), } } @@ -102,6 +106,103 @@ impl LogScanner { Ok(()) } + pub async fn list_offsets_latest(&self, buckets: Vec) -> Result> { + // TODO: support partition_id + let partition_id = None; + let offset_spec = OffsetSpec::Latest; + + self.metadata + .check_and_update_table_metadata(from_ref(&self.table_path)) + .await?; + + let cluster = self.metadata.get_cluster(); + let table_id = cluster.get_table(&self.table_path).table_id; + + // Prepare requests + let requests_by_server = self.prepare_list_offsets_requests( + table_id, + partition_id, + buckets.clone(), + offset_spec, + )?; + + // Send Requests + let response_futures = self.send_list_offsets_request(requests_by_server).await?; + + let mut results = HashMap::new(); + + for response_future in response_futures { + let offsets = 
response_future.await.map_err( + // todo: consider use suitable error + |e| crate::error::Error::WriteError(format!("Fail to get result: {e}")), + )?; + results.extend(offsets?); + } + Ok(results) + } + + fn prepare_list_offsets_requests( + &self, + table_id: i64, + partition_id: Option, + buckets: Vec, + offset_spec: OffsetSpec, + ) -> Result> { + let cluster = self.metadata.get_cluster(); + let mut node_for_bucket_list: HashMap> = HashMap::new(); + + for bucket_id in buckets { + let table_bucket = TableBucket::new(table_id, bucket_id); + let leader = cluster.leader_for(&table_bucket).ok_or_else(|| { + // todo: consider use another suitable error + crate::error::Error::InvalidTableError(format!( + "No leader found for table bucket: table_id={table_id}, bucket_id={bucket_id}" + )) + })?; + + node_for_bucket_list + .entry(leader.id()) + .or_default() + .push(bucket_id); + } + + let mut list_offsets_requests = HashMap::new(); + for (leader_id, bucket_ids) in node_for_bucket_list { + let request = + ListOffsetsRequest::new(table_id, partition_id, bucket_ids, offset_spec.clone()); + list_offsets_requests.insert(leader_id, request); + } + Ok(list_offsets_requests) + } + + async fn send_list_offsets_request( + &self, + request_map: HashMap, + ) -> Result>>>> { + let mut tasks = Vec::new(); + + for (leader_id, request) in request_map { + let rpc_client = self.conns.clone(); + let metadata = self.metadata.clone(); + + let task = tokio::spawn(async move { + let cluster = metadata.get_cluster(); + let tablet_server = cluster.get_tablet_server(leader_id).ok_or_else(|| { + // todo: consider use more suitable error + crate::error::Error::InvalidTableError(format!( + "Tablet server {leader_id} not found" + )) + })?; + let connection = rpc_client.get_connection(tablet_server).await?; + let list_offsets_response = connection.request(request).await?; + list_offsets_response.offsets() + }); + tasks.push(task); + } + + Ok(tasks) + } + async fn poll_for_fetches(&self) -> Result>> { 
self.log_fetcher.send_fetches_and_collect().await } diff --git a/fluss-rust/crates/fluss/src/proto/fluss_api.proto b/fluss-rust/crates/fluss/src/proto/fluss_api.proto index d71197b2b0..ef460fc559 100644 --- a/fluss-rust/crates/fluss/src/proto/fluss_api.proto +++ b/fluss-rust/crates/fluss/src/proto/fluss_api.proto @@ -202,6 +202,19 @@ message ListDatabasesResponse { repeated string database_name = 1; } +// list offsets request and response +message ListOffsetsRequest { + required int32 follower_server_id = 1; // value -1 indicate the request from client. + required int32 offset_type = 2; // value can be 0,1,2 (see ListOffsetsParam for more details) + required int64 table_id = 3; + optional int64 partition_id = 4; + repeated int32 bucket_id = 5 [packed = true]; // it is recommended to use packed for repeated numerics to get more efficient encoding + optional int64 startTimestamp = 6; +} +message ListOffsetsResponse { + repeated PbListOffsetsRespForBucket buckets_resp = 1; +} + // fetch log request and response message FetchLogRequest { @@ -262,6 +275,13 @@ message PbRemoteLogSegment { required int32 segment_size_in_bytes = 4; } +message PbListOffsetsRespForBucket { + required int32 bucket_id = 1; + optional int32 error_code = 2; + optional string error_message = 3; + optional int64 offset = 4; +} + // fetch latest lake snapshot message GetLatestLakeSnapshotRequest { required PbTablePath table_path = 1; diff --git a/fluss-rust/crates/fluss/src/record/mod.rs b/fluss-rust/crates/fluss/src/record/mod.rs index d7872055b9..07fbe0808c 100644 --- a/fluss-rust/crates/fluss/src/record/mod.rs +++ b/fluss-rust/crates/fluss/src/record/mod.rs @@ -84,6 +84,7 @@ impl fmt::Display for ChangeType { } } +#[derive(Clone)] pub struct ScanRecord { pub row: ColumnarRow, offset: i64, @@ -158,6 +159,10 @@ impl ScanRecords { pub fn is_empty(&self) -> bool { self.records.is_empty() } + + pub fn records_by_buckets(&self) -> &HashMap> { + &self.records + } } impl IntoIterator for ScanRecords { 
diff --git a/fluss-rust/crates/fluss/src/row/column.rs b/fluss-rust/crates/fluss/src/row/column.rs index 44ca640b51..6d47836d9b 100644 --- a/fluss-rust/crates/fluss/src/row/column.rs +++ b/fluss-rust/crates/fluss/src/row/column.rs @@ -22,6 +22,7 @@ use arrow::array::{ }; use std::sync::Arc; +#[derive(Clone)] pub struct ColumnarRow { record_batch: Arc, row_id: usize, @@ -45,6 +46,14 @@ impl ColumnarRow { pub fn set_row_id(&mut self, row_id: usize) { self.row_id = row_id } + + pub fn get_row_id(&self) -> usize { + self.row_id + } + + pub fn get_record_batch(&self) -> &RecordBatch { + &self.record_batch + } } impl InternalRow for ColumnarRow { diff --git a/fluss-rust/crates/fluss/src/rpc/api_key.rs b/fluss-rust/crates/fluss/src/rpc/api_key.rs index 18ce44fbef..215bb39389 100644 --- a/fluss-rust/crates/fluss/src/rpc/api_key.rs +++ b/fluss-rust/crates/fluss/src/rpc/api_key.rs @@ -31,6 +31,7 @@ pub enum ApiKey { MetaData, ProduceLog, FetchLog, + ListOffsets, GetDatabaseInfo, GetLatestLakeSnapshot, Unknown(i16), @@ -51,6 +52,7 @@ impl From for ApiKey { 1012 => ApiKey::MetaData, 1014 => ApiKey::ProduceLog, 1015 => ApiKey::FetchLog, + 1021 => ApiKey::ListOffsets, 1032 => ApiKey::GetLatestLakeSnapshot, 1035 => ApiKey::GetDatabaseInfo, _ => Unknown(key), @@ -73,6 +75,7 @@ impl From for i16 { ApiKey::MetaData => 1012, ApiKey::ProduceLog => 1014, ApiKey::FetchLog => 1015, + ApiKey::ListOffsets => 1021, ApiKey::GetLatestLakeSnapshot => 1032, ApiKey::GetDatabaseInfo => 1035, Unknown(x) => x, diff --git a/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs b/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs new file mode 100644 index 0000000000..500db33e8c --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::{impl_read_version_type, impl_write_version_type, proto}; + +use crate::error::Error; +use crate::error::Result as FlussResult; +use crate::proto::ListOffsetsResponse; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; +use std::collections::HashMap; + +use bytes::{Buf, BufMut}; +use prost::Message; + +/// Offset type constants as per proto comments +pub const LIST_EARLIEST_OFFSET: i32 = 0; +pub const LIST_LATEST_OFFSET: i32 = 1; +pub const LIST_OFFSET_FROM_TIMESTAMP: i32 = 2; + +/// Client follower server id constant +pub const CLIENT_FOLLOWER_SERVER_ID: i32 = -1; + +/// Offset specification for list offsets request +#[derive(Debug, Clone)] +pub enum OffsetSpec { + /// Earliest offset spec + Earliest, + /// Latest offset spec + Latest, + /// Timestamp offset spec + Timestamp(i64), +} + +impl OffsetSpec { + pub fn offset_type(&self) -> i32 { + match self { + OffsetSpec::Earliest => LIST_EARLIEST_OFFSET, + OffsetSpec::Latest => LIST_LATEST_OFFSET, + OffsetSpec::Timestamp(_) => LIST_OFFSET_FROM_TIMESTAMP, + } + } + + pub fn start_timestamp(&self) -> Option { + match self { + OffsetSpec::Timestamp(ts) => Some(*ts), + _ => None, + } + } +} + +#[derive(Debug)] +pub struct ListOffsetsRequest { + pub inner_request: 
proto::ListOffsetsRequest, +} + +impl ListOffsetsRequest { + pub fn new( + table_id: i64, + partition_id: Option, + bucket_ids: Vec, + offset_spec: OffsetSpec, + ) -> Self { + ListOffsetsRequest { + inner_request: proto::ListOffsetsRequest { + follower_server_id: CLIENT_FOLLOWER_SERVER_ID, + offset_type: offset_spec.offset_type(), + table_id, + partition_id, + bucket_id: bucket_ids, + start_timestamp: offset_spec.start_timestamp(), + }, + } + } +} + +impl RequestBody for ListOffsetsRequest { + type ResponseBody = ListOffsetsResponse; + + const API_KEY: ApiKey = ApiKey::ListOffsets; + + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(ListOffsetsRequest); +impl_read_version_type!(ListOffsetsResponse); + +impl ListOffsetsResponse { + pub fn offsets(&self) -> FlussResult> { + self.buckets_resp + .iter() + .map(|resp| { + if resp.error_code.is_some() { + // todo: consider use another suitable error + Err(Error::WriteError(format!( + "Missing offset, error message: {}", + resp.error_message + .as_deref() + .unwrap_or("unknown server exception") + ))) + } else { + // if no error msg, offset must exists + Ok((resp.bucket_id, resp.offset.unwrap())) + } + }) + .collect() + } +} diff --git a/fluss-rust/crates/fluss/src/rpc/message/mod.rs b/fluss-rust/crates/fluss/src/rpc/message/mod.rs index d5f8ebde89..230d971a49 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/mod.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/mod.rs @@ -31,6 +31,7 @@ mod get_latest_lake_snapshot; mod get_table; mod header; mod list_databases; +mod list_offsets; mod list_tables; mod produce_log; mod table_exists; @@ -47,6 +48,7 @@ pub use get_latest_lake_snapshot::*; pub use get_table::*; pub use header::*; pub use list_databases::*; +pub use list_offsets::*; pub use list_tables::*; pub use produce_log::*; pub use table_exists::*; From 2f722d2857f6859b2632d1d6008704fea18856f6 Mon Sep 17 00:00:00 2001 From: Junbo Wang Date: Thu, 16 Oct 2025 21:02:37 +0800 Subject: 
[PATCH 013/287] [test] Add IT for table operation in admin (#32) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --------- Co-authored-by: 王俊博(wangjunbo) Co-authored-by: luoyuxia --- fluss-rust/crates/fluss/src/metadata/table.rs | 6 +- .../crates/fluss/tests/integration/admin.rs | 122 +++++++++++++++++- 2 files changed, 123 insertions(+), 5 deletions(-) diff --git a/fluss-rust/crates/fluss/src/metadata/table.rs b/fluss-rust/crates/fluss/src/metadata/table.rs index 2b48ec60db..751dd6da02 100644 --- a/fluss-rust/crates/fluss/src/metadata/table.rs +++ b/fluss-rust/crates/fluss/src/metadata/table.rs @@ -23,7 +23,7 @@ use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::fmt::{Display, Formatter}; -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct Column { name: String, data_type: DataType, @@ -66,7 +66,7 @@ impl Column { } } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct PrimaryKey { constraint_name: String, column_names: Vec, @@ -90,7 +90,7 @@ impl PrimaryKey { } } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct Schema { columns: Vec, primary_key: Option, diff --git a/fluss-rust/crates/fluss/tests/integration/admin.rs b/fluss-rust/crates/fluss/tests/integration/admin.rs index 73f52db936..0d958a5656 100644 --- a/fluss-rust/crates/fluss/tests/integration/admin.rs +++ b/fluss-rust/crates/fluss/tests/integration/admin.rs @@ -33,7 +33,10 @@ static SHARED_FLUSS_CLUSTER: Lazy>>> = mod admin_test { use super::SHARED_FLUSS_CLUSTER; use crate::integration::fluss_cluster::{FlussTestingCluster, FlussTestingClusterBuilder}; - use fluss::metadata::DatabaseDescriptorBuilder; + use fluss::metadata::{ + DataTypes, DatabaseDescriptorBuilder, KvFormat, 
LogFormat, Schema, TableDescriptor, + TablePath, + }; use std::sync::Arc; fn before_all() { @@ -126,6 +129,121 @@ mod admin_test { #[tokio::test] async fn test_create_table() { - // todo + let cluster = get_fluss_cluster(); + let connection = cluster.get_fluss_connection().await; + let admin = connection + .get_admin() + .await + .expect("Failed to get admin client"); + + let test_db_name = "test_create_table_db"; + let db_descriptor = DatabaseDescriptorBuilder::default() + .comment("Database for test_create_table") + .build(); + + assert_eq!(admin.database_exists(test_db_name).await.unwrap(), false); + admin + .create_database(test_db_name, false, Some(&db_descriptor)) + .await + .expect("Failed to create test database"); + + let test_table_name = "test_user_table"; + let table_path = TablePath::new(test_db_name.to_string(), test_table_name.to_string()); + + // build table schema + let table_schema = Schema::builder() + .column("id", DataTypes::int()) + .column("name", DataTypes::string()) + .column("age", DataTypes::int()) + .with_comment("User's age (optional)") + .column("email", DataTypes::string()) + .primary_key(vec!["id".to_string()]) + .build() + .expect("Failed to build table schema"); + + // build table descriptor + let table_descriptor = TableDescriptor::builder() + .schema(table_schema.clone()) + .comment("Test table for user data (id, name, age, email)") + .distributed_by(Some(3), vec!["id".to_string()]) + .property("table.replication.factor", "1") + .log_format(LogFormat::ARROW) + .kv_format(KvFormat::INDEXED) + .build() + .expect("Failed to build table descriptor"); + + // create test table + admin + .create_table(&table_path, &table_descriptor, false) + .await + .expect("Failed to create test table"); + + assert!( + admin.table_exists(&table_path).await.unwrap(), + "Table {:?} should exist after creation", + table_path + ); + + let tables = admin.list_tables(test_db_name).await.unwrap(); + assert_eq!( + tables.len(), + 1, + "There should be exactly 
one table in the database" + ); + assert!( + tables.contains(&test_table_name.to_string()), + "Table list should contain the created table" + ); + + let table_info = admin + .get_table(&table_path) + .await + .expect("Failed to get table info"); + + // verify table comment + assert_eq!( + table_info.get_comment(), + Some("Test table for user data (id, name, age, email)"), + "Table comment mismatch" + ); + + // verify schema columns + let actual_schema = table_info.get_schema(); + assert_eq!(actual_schema, table_descriptor.schema(), "Schema mismatch"); + + // verify primary key + assert_eq!( + table_info.get_primary_keys(), + &vec!["id".to_string()], + "Primary key columns mismatch" + ); + + // verify distribution and properties + assert_eq!(table_info.get_num_buckets(), 3, "Bucket count mismatch"); + assert_eq!( + table_info.get_bucket_keys(), + &vec!["id".to_string()], + "Bucket keys mismatch" + ); + + assert_eq!( + table_info.get_properties(), + table_descriptor.properties(), + "Properties mismatch" + ); + + // drop table + admin + .drop_table(&table_path, false) + .await + .expect("Failed to drop table"); + // table shouldn't exist now + assert_eq!(admin.table_exists(&table_path).await.unwrap(), false); + + // drop database + admin.drop_database(test_db_name, false, true).await; + + // database shouldn't exist now + assert_eq!(admin.database_exists(test_db_name).await.unwrap(), false); } } From 1ec45d3c3e2d34d3bc1781d45895c09ee4483b23 Mon Sep 17 00:00:00 2001 From: naivedogger <59598718+naivedogger@users.noreply.github.com> Date: Fri, 17 Oct 2025 16:54:04 +0800 Subject: [PATCH 014/287] [feat] Add examples and stub files for Python bindings (#10) --------- Co-authored-by: luoyuxia --- fluss-rust/.licenserc.yaml | 1 + fluss-rust/bindings/python/README.md | 19 +- fluss-rust/bindings/python/example/example.py | 188 ++++++++++++++++++ fluss-rust/bindings/python/fluss/__init__.pyi | 171 ++++++++++++++++ fluss-rust/bindings/python/fluss/py.typed | 0 5 files changed, 
369 insertions(+), 10 deletions(-) create mode 100644 fluss-rust/bindings/python/example/example.py create mode 100644 fluss-rust/bindings/python/fluss/__init__.pyi create mode 100644 fluss-rust/bindings/python/fluss/py.typed diff --git a/fluss-rust/.licenserc.yaml b/fluss-rust/.licenserc.yaml index 3813b484b3..a3cfcd146b 100644 --- a/fluss-rust/.licenserc.yaml +++ b/fluss-rust/.licenserc.yaml @@ -26,4 +26,5 @@ header: - 'LICENSE' - 'NOTICE' - 'DISCLAIMER' + - 'bindings/python/fluss/py.typed' comment: on-failure diff --git a/fluss-rust/bindings/python/README.md b/fluss-rust/bindings/python/README.md index 5258f53291..44d6099c6c 100644 --- a/fluss-rust/bindings/python/README.md +++ b/fluss-rust/bindings/python/README.md @@ -108,7 +108,7 @@ uv run python example/example.py ### Build API docs: ```bash -uv run pdoc fluss_python +uv run pdoc fluss ``` ### Release @@ -124,10 +124,10 @@ uv run maturin publish ## Project Structure ``` bindings/python/ -├── Cargo.toml # Rust dependency configuration -├── pyproject.toml # Python project configuration -├── README.md # This file -├── src/ # Rust source code +├── Cargo.toml # Rust dependency configuration +├── pyproject.toml # Python project configuration +├── README.md # This file +├── src/ # Rust source code │ ├── lib.rs # Main entry module │ ├── config.rs # Configuration related │ ├── connection.rs # Connection management @@ -135,11 +135,10 @@ bindings/python/ │ ├── table.rs # Table operations │ ├── types.rs # Data types │ └── error.rs # Error handling -├── python/ # Python package source -│ └── fluss_python/ -│ ├── __init__.py # Python package entry -│ ├── __init__.pyi # Stub file -│ └── py.typed # Type declarations +├── fluss/ # Python package source +│ ├── __init__.py # Python package entry +│ ├── __init__.pyi # Stub file +│ └── py.typed # Type declarations └── example/ # Example code └── example.py ``` diff --git a/fluss-rust/bindings/python/example/example.py b/fluss-rust/bindings/python/example/example.py new file mode 
100644 index 0000000000..0523f943e4 --- /dev/null +++ b/fluss-rust/bindings/python/example/example.py @@ -0,0 +1,188 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import asyncio +import time + +import pandas as pd +import pyarrow as pa + +import fluss + + +async def main(): + # Create connection configuration + config_spec = { + "bootstrap.servers": "127.0.0.1:9123", + # Add other configuration options as needed + "request.max.size": "10485760", # 10 MB + "writer.acks": "all", # Wait for all replicas to acknowledge + "writer.retries": "3", # Retry up to 3 times on failure + "writer.batch.size": "1000", # Batch size for writes + } + config = fluss.Config(config_spec) + + # Create connection using the static connect method + conn = await fluss.FlussConnection.connect(config) + + # Define fields for PyArrow + fields = [ + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + pa.field("score", pa.float32()), + pa.field("age", pa.int32()), + ] + + # Create a PyArrow schema + schema = pa.schema(fields) + + # Create a Fluss Schema first (this is what TableDescriptor expects) + fluss_schema = fluss.Schema(schema) + + # Create a Fluss TableDescriptor + table_descriptor = fluss.TableDescriptor(fluss_schema) + + # Get the 
admin for Fluss + admin = await conn.get_admin() + + # Create a Fluss table + table_path = fluss.TablePath("fluss", "sample_table") + + try: + await admin.create_table(table_path, table_descriptor, True) + print(f"Created table: {table_path}") + except Exception as e: + print(f"Table creation failed: {e}") + + # Get table information via admin + try: + table_info = await admin.get_table(table_path) + print(f"Table info: {table_info}") + print(f"Table ID: {table_info.table_id}") + print(f"Schema ID: {table_info.schema_id}") + print(f"Created time: {table_info.created_time}") + print(f"Primary keys: {table_info.get_primary_keys()}") + except Exception as e: + print(f"Failed to get table info: {e}") + + # Get the table instance + table = await conn.get_table(table_path) + print(f"Got table: {table}") + + # Create a writer for the table + append_writer = await table.new_append_writer() + print(f"Created append writer: {append_writer}") + + try: + # Test 1: Write PyArrow Table + print("\n--- Testing PyArrow Table write ---") + pa_table = pa.Table.from_arrays( + [ + pa.array([1, 2, 3], type=pa.int32()), + pa.array(["Alice", "Bob", "Charlie"], type=pa.string()), + pa.array([95.2, 87.2, 92.1], type=pa.float32()), + pa.array([25, 30, 35], type=pa.int32()), + ], + schema=schema, + ) + + append_writer.write_arrow(pa_table) + print("Successfully wrote PyArrow Table") + + # Test 2: Write PyArrow RecordBatch + print("\n--- Testing PyArrow RecordBatch write ---") + pa_record_batch = pa.RecordBatch.from_arrays( + [ + pa.array([4, 5], type=pa.int32()), + pa.array(["David", "Eve"], type=pa.string()), + pa.array([88.5, 91.0], type=pa.float32()), + pa.array([28, 32], type=pa.int32()), + ], + schema=schema, + ) + + append_writer.write_arrow_batch(pa_record_batch) + print("Successfully wrote PyArrow RecordBatch") + + # Test 3: Write Pandas DataFrame + print("\n--- Testing Pandas DataFrame write ---") + df = pd.DataFrame( + { + "id": [6, 7], + "name": ["Frank", "Grace"], + "score": 
[89.3, 94.7], + "age": [29, 27], + } + ) + + append_writer.write_pandas(df) + print("Successfully wrote Pandas DataFrame") + + # Flush all pending data + print("\n--- Flushing data ---") + append_writer.flush() + print("Successfully flushed data") + + except Exception as e: + print(f"Error during writing: {e}") + + # Now scan the table to verify data was written + print("\n--- Scanning table ---") + try: + log_scanner = await table.new_log_scanner() + print(f"Created log scanner: {log_scanner}") + + # Subscribe to scan from earliest to latest + # start_timestamp=None (earliest), end_timestamp=None (latest) + log_scanner.subscribe(None, None) + + print("Scanning results using to_arrow():") + + # Try to get as PyArrow Table + try: + pa_table_result = log_scanner.to_arrow() + print(f"\nAs PyArrow Table: {pa_table_result}") + except Exception as e: + print(f"Could not convert to PyArrow: {e}") + + # Let's subscribe from the beginning again. + # Reset subscription + log_scanner.subscribe(None, None) + + # Try to get as Pandas DataFrame + try: + df_result = log_scanner.to_pandas() + print(f"\nAs Pandas DataFrame:\n{df_result}") + except Exception as e: + print(f"Could not convert to Pandas: {e}") + + # TODO: support to_arrow_batch_reader() + # which is reserved for streaming use cases + + # TODO: support to_duckdb() + + except Exception as e: + print(f"Error during scanning: {e}") + + # Close connection + conn.close() + print("\nConnection closed") + + +if __name__ == "__main__": + # Run the async main function + asyncio.run(main()) diff --git a/fluss-rust/bindings/python/fluss/__init__.pyi b/fluss-rust/bindings/python/fluss/__init__.pyi new file mode 100644 index 0000000000..45652425ba --- /dev/null +++ b/fluss-rust/bindings/python/fluss/__init__.pyi @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Type stubs for Fluss Python bindings.""" + +from types import TracebackType +from typing import Dict, List, Optional, Tuple + +import pandas as pd +import pyarrow as pa + +class Config: + def __init__(self, properties: Optional[Dict[str, str]] = None) -> None: ... + @property + def bootstrap_server(self) -> Optional[str]: ... + @bootstrap_server.setter + def bootstrap_server(self, server: str) -> None: ... + @property + def request_max_size(self) -> int: ... + @request_max_size.setter + def request_max_size(self, size: int) -> None: ... + @property + def writer_batch_size(self) -> int: ... + @writer_batch_size.setter + def writer_batch_size(self, size: int) -> None: ... + +class FlussConnection: + @staticmethod + async def connect(config: Config) -> FlussConnection: ... + async def get_admin(self) -> FlussAdmin: ... + async def get_table(self, table_path: TablePath) -> FlussTable: ... + def close(self) -> None: ... + def __enter__(self) -> FlussConnection: ... + def __exit__(self, exc_type: Optional[type], exc_value: Optional[BaseException], traceback: Optional[TracebackType]) -> bool: ... + def __repr__(self) -> str: ... 
+ +class FlussAdmin: + async def create_table( + self, + table_path: TablePath, + table_descriptor: TableDescriptor, + ignore_if_exists: Optional[bool] = False, + ) -> None: ... + async def get_table(self, table_path: TablePath) -> TableInfo: ... + async def get_latest_lake_snapshot(self, table_path: TablePath) -> LakeSnapshot: ... + def __repr__(self) -> str: ... + +class FlussTable: + async def new_append_writer(self) -> AppendWriter: ... + async def new_log_scanner(self) -> LogScanner: ... + def get_table_info(self) -> TableInfo: ... + def get_table_path(self) -> TablePath: ... + def has_primary_key(self) -> bool: ... + def __repr__(self) -> str: ... + +class AppendWriter: + def write_arrow(self, table: pa.Table) -> None: ... + def write_arrow_batch(self, batch: pa.RecordBatch) -> None: ... + def write_pandas(self, df: pd.DataFrame) -> None: ... + def flush(self) -> None: ... + def __repr__(self) -> str: ... + +class LogScanner: + def subscribe( + self, start_timestamp: Optional[int], end_timestamp: Optional[int] + ) -> None: ... + def to_pandas(self) -> pd.DataFrame: ... + def to_arrow(self) -> pa.Table: ... + def __repr__(self) -> str: ... + +class Schema: + def __init__(self, schema: pa.Schema, primary_keys: Optional[List[str]] = None) -> None: ... + def get_column_names(self) -> List[str]: ... + def get_column_types(self) -> List[str]: ... + def get_columns(self) -> List[Tuple[str,str]]: ... + def __str__(self) -> str: ... + +class TableDescriptor: + def __init__(self, schema: Schema, **kwargs: str) -> None: ... + def get_schema(self) -> Schema: ... + +class TablePath: + def __init__(self, database: str, table: str) -> None: ... + @property + def database_name(self) -> str: ... + @property + def table_name(self) -> str: ... + def table_path_str(self) -> str: ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + def __hash__(self) -> int: ... + def __eq__(self, other: object) -> bool: ... 
+ +class TableInfo: + @property + def table_id(self) -> int: ... + @property + def schema_id(self) -> int: ... + @property + def created_time(self) -> int: ... + @property + def modified_time(self) -> int: ... + @property + def table_path(self) -> TablePath: ... + @property + def num_buckets(self) -> int: ... + @property + def comment(self) -> Optional[str]: ... + def get_primary_keys(self) -> List[str]: ... + def get_bucket_keys(self) -> List[str]: ... + def get_partition_keys(self) -> List[str]: ... + def has_primary_key(self) -> bool: ... + def is_partitioned(self) -> bool: ... + def get_properties(self) -> Dict[str, str]: ... + def get_custom_properties(self) -> Dict[str, str]: ... + def get_schema(self) -> Schema: ... + def get_column_names(self) -> List[str]: ... + def get_column_count(self) -> int: ... + +class FlussError(Exception): + message: str + def __init__(self, message: str) -> None: ... + def __str__(self) -> str: ... + +class LakeSnapshot: + def __init__(self, snapshot_id: int) -> None: ... + @property + def snapshot_id(self) -> int: ... + @property + def table_buckets_offset(self) -> Dict[TableBucket, int]: ... + def get_bucket_offset(self, bucket: TableBucket) -> Optional[int]: ... + def get_table_buckets(self) -> List[TableBucket]: ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + +class TableBucket: + def __init__(self, table_id: int, bucket: int) -> None: ... + @staticmethod + def with_partition( + table_id: int, partition_id: int, bucket: int + ) -> TableBucket: ... + @property + def table_id(self) -> int: ... + @property + def bucket_id(self) -> int: ... + @property + def partition_id(self) -> Optional[int]: ... + def __hash__(self) -> int: ... + def __eq__(self, other: object) -> bool: ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + +class TableDistribution: + def bucket_keys(self) -> List[str]: ... + def bucket_count(self) -> Optional[int]: ... 
+ +__version__: str diff --git a/fluss-rust/bindings/python/fluss/py.typed b/fluss-rust/bindings/python/fluss/py.typed new file mode 100644 index 0000000000..e69de29bb2 From c0374eb31673e5aca338989bfc2f6d3829a1477f Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Mon, 20 Oct 2025 14:14:28 +0800 Subject: [PATCH 015/287] [chore] Move list offsets to admin (#35) --- fluss-rust/bindings/python/src/table.rs | 68 ++++++------ fluss-rust/bindings/python/src/utils.rs | 4 +- fluss-rust/crates/fluss/src/client/admin.rs | 104 +++++++++++++++++- .../crates/fluss/src/client/metadata.rs | 3 +- .../crates/fluss/src/client/table/mod.rs | 2 + .../crates/fluss/src/client/table/scanner.rs | 101 ----------------- .../crates/fluss/src/client/write/sender.rs | 2 +- fluss-rust/crates/fluss/src/record/mod.rs | 4 + fluss-rust/crates/fluss/src/rpc/mod.rs | 2 - 9 files changed, 149 insertions(+), 141 deletions(-) diff --git a/fluss-rust/bindings/python/src/table.rs b/fluss-rust/bindings/python/src/table.rs index 98943b939a..c255fa6f8a 100644 --- a/fluss-rust/bindings/python/src/table.rs +++ b/fluss-rust/bindings/python/src/table.rs @@ -17,12 +17,11 @@ use crate::TOKIO_RUNTIME; use crate::*; +use fluss::client::EARLIEST_OFFSET; +use fluss::rpc::message::OffsetSpec; use pyo3_async_runtimes::tokio::future_into_py; -use std::collections::HashSet; use std::sync::Arc; -const EARLIEST_OFFSET: i64 = -2; - /// Represents a Fluss table for data operations #[pyclass] pub struct FlussTable { @@ -70,8 +69,12 @@ impl FlussTable { let rust_scanner = table_scan.create_log_scanner(); - let py_scanner = LogScanner::from_core(rust_scanner, table_info.clone()); + let admin = conn + .get_admin() + .await + .map_err(|e| FlussError::new_err(e.to_string()))?; + let py_scanner = LogScanner::from_core(rust_scanner, admin, table_info.clone()); Python::with_gil(|py| Py::new(py, py_scanner)) }) } @@ -275,6 +278,7 @@ impl AppendWriter { #[pyclass] pub struct LogScanner { inner: fcore::client::LogScanner, + admin: 
fcore::client::FlussAdmin, table_info: fcore::metadata::TableInfo, #[allow(dead_code)] start_timestamp: Option, @@ -327,50 +331,50 @@ impl LogScanner { let bucket_ids: Vec = (0..num_buckets).collect(); // todo: after supporting list_offsets with timestamp, we can use start_timestamp and end_timestamp here - let target_offsets: HashMap = TOKIO_RUNTIME - .block_on(async { self.inner.list_offsets_latest(bucket_ids).await }) + let mut stopping_offsets: HashMap = TOKIO_RUNTIME + .block_on(async { + self.admin + .list_offsets( + &self.table_info.table_path, + bucket_ids.as_slice(), + OffsetSpec::Latest, + ) + .await + }) .map_err(|e| FlussError::new_err(e.to_string()))?; - let mut current_offsets: HashMap = HashMap::new(); - let mut completed_buckets: HashSet = HashSet::new(); - - if !target_offsets.is_empty() { + if !stopping_offsets.is_empty() { loop { let batch_result = TOKIO_RUNTIME .block_on(async { self.inner.poll(Duration::from_millis(500)).await }); match batch_result { Ok(scan_records) => { - let mut filtered_records: HashMap< - fcore::metadata::TableBucket, - Vec, - > = HashMap::new(); - for (bucket, records) in scan_records.records_by_buckets() { - let bucket_id = bucket.bucket_id(); - if completed_buckets.contains(&bucket_id) { + let mut result_records: Vec = vec![]; + for (bucket, records) in scan_records.into_records_by_buckets() { + let stopping_offset = stopping_offsets.get(&bucket.bucket_id()); + + if stopping_offset.is_none() { + // not to include this bucket, skip records for this bucket + // since we already reach end offset for this bucket continue; } if let Some(last_record) = records.last() { let offset = last_record.offset(); - current_offsets.insert(bucket_id, offset); - filtered_records.insert(bucket.clone(), records.clone()); - if offset >= target_offsets[&bucket_id] - 1 { - completed_buckets.insert(bucket_id); + result_records.extend(records); + if offset >= stopping_offset.unwrap() - 1 { + stopping_offsets.remove(&bucket.bucket_id()); } } } - 
if !filtered_records.is_empty() { - let filtered_scan_records = - fcore::record::ScanRecords::new(filtered_records); - let arrow_batch = - Utils::convert_scan_records_to_arrow(filtered_scan_records); + if !result_records.is_empty() { + let arrow_batch = Utils::convert_scan_records_to_arrow(result_records); all_batches.extend(arrow_batch); } - // completed bucket is equal to all target buckets, - // we can break scan records - if completed_buckets.len() == target_offsets.len() { + // we have reach end offsets of all bucket + if stopping_offsets.is_empty() { break; } } @@ -399,11 +403,13 @@ impl LogScanner { impl LogScanner { /// Create LogScanner from core LogScanner pub fn from_core( - inner: fcore::client::LogScanner, + inner_scanner: fcore::client::LogScanner, + admin: fcore::client::FlussAdmin, table_info: fcore::metadata::TableInfo, ) -> Self { Self { - inner, + inner: inner_scanner, + admin, table_info, start_timestamp: None, end_timestamp: None, diff --git a/fluss-rust/bindings/python/src/utils.rs b/fluss-rust/bindings/python/src/utils.rs index 9642e9d95b..93933b3774 100644 --- a/fluss-rust/bindings/python/src/utils.rs +++ b/fluss-rust/bindings/python/src/utils.rs @@ -152,9 +152,9 @@ impl Utils { .map_err(|e| FlussError::new_err(format!("Invalid kv format '{format_str}': {e}"))) } - /// Convert ScanRecords to Arrow RecordBatch + /// Convert Vec to Arrow RecordBatch pub fn convert_scan_records_to_arrow( - _scan_records: fcore::record::ScanRecords, + _scan_records: Vec, ) -> Vec> { let mut result = Vec::new(); for record in _scan_records { diff --git a/fluss-rust/crates/fluss/src/client/admin.rs b/fluss-rust/crates/fluss/src/client/admin.rs index fd0f316374..fefab43520 100644 --- a/fluss-rust/crates/fluss/src/client/admin.rs +++ b/fluss-rust/crates/fluss/src/client/admin.rs @@ -25,13 +25,16 @@ use crate::rpc::message::{ DropTableRequest, GetDatabaseInfoRequest, GetLatestLakeSnapshotRequest, GetTableRequest, ListDatabasesRequest, ListTablesRequest, 
TableExistsRequest, }; +use crate::rpc::message::{ListOffsetsRequest, OffsetSpec}; use crate::rpc::{RpcClient, ServerConnection}; -use std::collections::HashMap; -use std::sync::Arc; - +use crate::BucketId; use crate::error::Result; use crate::proto::GetTableInfoResponse; +use std::collections::HashMap; +use std::slice::from_ref; +use std::sync::Arc; +use tokio::task::JoinHandle; pub struct FlussAdmin { admin_gateway: ServerConnection, @@ -216,4 +219,99 @@ impl FlussAdmin { table_buckets_offset, )) } + + /// List offset for the specified buckets. This operation enables to find the beginning offset, + /// end offset as well as the offset matching a timestamp in buckets. + pub async fn list_offsets( + &self, + table_path: &TablePath, + buckets_id: &[BucketId], + offset_spec: OffsetSpec, + ) -> Result> { + self.metadata + .check_and_update_table_metadata(from_ref(table_path)) + .await?; + + let cluster = self.metadata.get_cluster(); + let table_id = cluster.get_table(table_path).table_id; + + // Prepare requests + let requests_by_server = + self.prepare_list_offsets_requests(table_id, None, buckets_id, offset_spec)?; + + // Send Requests + let response_futures = self.send_list_offsets_request(requests_by_server).await?; + + let mut results = HashMap::new(); + + for response_future in response_futures { + let offsets = response_future.await.map_err( + // todo: consider use suitable error + |e| crate::error::Error::WriteError(format!("Fail to get result: {e}")), + )?; + results.extend(offsets?); + } + Ok(results) + } + + fn prepare_list_offsets_requests( + &self, + table_id: i64, + partition_id: Option, + buckets: &[BucketId], + offset_spec: OffsetSpec, + ) -> Result> { + let cluster = self.metadata.get_cluster(); + let mut node_for_bucket_list: HashMap> = HashMap::new(); + + for bucket_id in buckets { + let table_bucket = TableBucket::new(table_id, *bucket_id); + let leader = cluster.leader_for(&table_bucket).ok_or_else(|| { + // todo: consider use another suitable 
error + crate::error::Error::InvalidTableError(format!( + "No leader found for table bucket: table_id={table_id}, bucket_id={bucket_id}" + )) + })?; + + node_for_bucket_list + .entry(leader.id()) + .or_default() + .push(*bucket_id); + } + + let mut list_offsets_requests = HashMap::new(); + for (leader_id, bucket_ids) in node_for_bucket_list { + let request = + ListOffsetsRequest::new(table_id, partition_id, bucket_ids, offset_spec.clone()); + list_offsets_requests.insert(leader_id, request); + } + Ok(list_offsets_requests) + } + + async fn send_list_offsets_request( + &self, + request_map: HashMap, + ) -> Result>>>> { + let mut tasks = Vec::new(); + + for (leader_id, request) in request_map { + let rpc_client = self.rpc_client.clone(); + let metadata = self.metadata.clone(); + + let task = tokio::spawn(async move { + let cluster = metadata.get_cluster(); + let tablet_server = cluster.get_tablet_server(leader_id).ok_or_else(|| { + // todo: consider use more suitable error + crate::error::Error::InvalidTableError(format!( + "Tablet server {leader_id} not found" + )) + })?; + let connection = rpc_client.get_connection(tablet_server).await?; + let list_offsets_response = connection.request(request).await?; + list_offsets_response.offsets() + }); + tasks.push(task); + } + Ok(tasks) + } } diff --git a/fluss-rust/crates/fluss/src/client/metadata.rs b/fluss-rust/crates/fluss/src/client/metadata.rs index ebfb959f65..3c3ba4bd2e 100644 --- a/fluss-rust/crates/fluss/src/client/metadata.rs +++ b/fluss-rust/crates/fluss/src/client/metadata.rs @@ -17,7 +17,8 @@ use crate::cluster::{Cluster, ServerNode, ServerType}; use crate::metadata::{TableBucket, TablePath}; -use crate::rpc::{RpcClient, ServerConnection, UpdateMetadataRequest}; +use crate::rpc::message::UpdateMetadataRequest; +use crate::rpc::{RpcClient, ServerConnection}; use parking_lot::RwLock; use std::collections::HashSet; use std::net::SocketAddr; diff --git a/fluss-rust/crates/fluss/src/client/table/mod.rs 
b/fluss-rust/crates/fluss/src/client/table/mod.rs index 07e64948fa..52ae700fc6 100644 --- a/fluss-rust/crates/fluss/src/client/table/mod.rs +++ b/fluss-rust/crates/fluss/src/client/table/mod.rs @@ -22,6 +22,8 @@ use std::sync::Arc; use crate::error::Result; +pub const EARLIEST_OFFSET: i64 = -2; + mod append; mod scanner; diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index cbe724896e..e1ab59ffbb 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -22,14 +22,12 @@ use crate::metadata::{TableBucket, TableInfo, TablePath}; use crate::proto::{FetchLogRequest, PbFetchLogReqForBucket, PbFetchLogReqForTable}; use crate::record::{LogRecordsBatchs, ReadContext, ScanRecord, ScanRecords, to_arrow_schema}; use crate::rpc::RpcClient; -use crate::rpc::message::{ListOffsetsRequest, OffsetSpec}; use crate::util::FairBucketStatusMap; use parking_lot::RwLock; use std::collections::HashMap; use std::slice::from_ref; use std::sync::Arc; use std::time::Duration; -use tokio::task::JoinHandle; const LOG_FETCH_MAX_BYTES: i32 = 16 * 1024 * 1024; #[allow(dead_code)] @@ -67,7 +65,6 @@ pub struct LogScanner { metadata: Arc, log_scanner_status: Arc, log_fetcher: LogFetcher, - conns: Arc, } impl LogScanner { @@ -88,7 +85,6 @@ impl LogScanner { metadata.clone(), log_scanner_status.clone(), ), - conns: connections.clone(), } } @@ -106,103 +102,6 @@ impl LogScanner { Ok(()) } - pub async fn list_offsets_latest(&self, buckets: Vec) -> Result> { - // TODO: support partition_id - let partition_id = None; - let offset_spec = OffsetSpec::Latest; - - self.metadata - .check_and_update_table_metadata(from_ref(&self.table_path)) - .await?; - - let cluster = self.metadata.get_cluster(); - let table_id = cluster.get_table(&self.table_path).table_id; - - // Prepare requests - let requests_by_server = self.prepare_list_offsets_requests( - table_id, - partition_id, - 
buckets.clone(), - offset_spec, - )?; - - // Send Requests - let response_futures = self.send_list_offsets_request(requests_by_server).await?; - - let mut results = HashMap::new(); - - for response_future in response_futures { - let offsets = response_future.await.map_err( - // todo: consider use suitable error - |e| crate::error::Error::WriteError(format!("Fail to get result: {e}")), - )?; - results.extend(offsets?); - } - Ok(results) - } - - fn prepare_list_offsets_requests( - &self, - table_id: i64, - partition_id: Option, - buckets: Vec, - offset_spec: OffsetSpec, - ) -> Result> { - let cluster = self.metadata.get_cluster(); - let mut node_for_bucket_list: HashMap> = HashMap::new(); - - for bucket_id in buckets { - let table_bucket = TableBucket::new(table_id, bucket_id); - let leader = cluster.leader_for(&table_bucket).ok_or_else(|| { - // todo: consider use another suitable error - crate::error::Error::InvalidTableError(format!( - "No leader found for table bucket: table_id={table_id}, bucket_id={bucket_id}" - )) - })?; - - node_for_bucket_list - .entry(leader.id()) - .or_default() - .push(bucket_id); - } - - let mut list_offsets_requests = HashMap::new(); - for (leader_id, bucket_ids) in node_for_bucket_list { - let request = - ListOffsetsRequest::new(table_id, partition_id, bucket_ids, offset_spec.clone()); - list_offsets_requests.insert(leader_id, request); - } - Ok(list_offsets_requests) - } - - async fn send_list_offsets_request( - &self, - request_map: HashMap, - ) -> Result>>>> { - let mut tasks = Vec::new(); - - for (leader_id, request) in request_map { - let rpc_client = self.conns.clone(); - let metadata = self.metadata.clone(); - - let task = tokio::spawn(async move { - let cluster = metadata.get_cluster(); - let tablet_server = cluster.get_tablet_server(leader_id).ok_or_else(|| { - // todo: consider use more suitable error - crate::error::Error::InvalidTableError(format!( - "Tablet server {leader_id} not found" - )) - })?; - let connection = 
rpc_client.get_connection(tablet_server).await?; - let list_offsets_response = connection.request(request).await?; - list_offsets_response.offsets() - }); - tasks.push(task); - } - - Ok(tasks) - } - async fn poll_for_fetches(&self) -> Result>> { self.log_fetcher.send_fetches_and_collect().await } diff --git a/fluss-rust/crates/fluss/src/client/write/sender.rs b/fluss-rust/crates/fluss/src/client/write/sender.rs index 381e10c5b1..e25e2bace2 100644 --- a/fluss-rust/crates/fluss/src/client/write/sender.rs +++ b/fluss-rust/crates/fluss/src/client/write/sender.rs @@ -21,7 +21,7 @@ use crate::error::Error::WriteError; use crate::error::Result; use crate::metadata::TableBucket; use crate::proto::ProduceLogResponse; -use crate::rpc::ProduceLogRequest; +use crate::rpc::message::ProduceLogRequest; use parking_lot::Mutex; use std::collections::HashMap; use std::sync::Arc; diff --git a/fluss-rust/crates/fluss/src/record/mod.rs b/fluss-rust/crates/fluss/src/record/mod.rs index 07fbe0808c..35928ea082 100644 --- a/fluss-rust/crates/fluss/src/record/mod.rs +++ b/fluss-rust/crates/fluss/src/record/mod.rs @@ -163,6 +163,10 @@ impl ScanRecords { pub fn records_by_buckets(&self) -> &HashMap> { &self.records } + + pub fn into_records_by_buckets(self) -> HashMap> { + self.records + } } impl IntoIterator for ScanRecords { diff --git a/fluss-rust/crates/fluss/src/rpc/mod.rs b/fluss-rust/crates/fluss/src/rpc/mod.rs index 496c015073..b8705a3f65 100644 --- a/fluss-rust/crates/fluss/src/rpc/mod.rs +++ b/fluss-rust/crates/fluss/src/rpc/mod.rs @@ -26,6 +26,4 @@ pub use server_connection::*; mod convert; mod transport; -pub use message::*; - pub use convert::*; From b27625907e11d9ad51251db40e84d5c968a57409 Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Tue, 21 Oct 2025 15:04:50 +0800 Subject: [PATCH 016/287] [feat] Support append arrow record batch (#34) --- fluss-rust/.github/workflows/ci.yml | 2 +- .../crates/fluss/src/client/table/append.rs | 11 +- .../fluss/src/client/write/accumulator.rs | 
4 +- .../crates/fluss/src/client/write/batch.rs | 22 +- .../crates/fluss/src/client/write/mod.rs | 20 +- .../crates/fluss/src/client/write/sender.rs | 1 - .../fluss/src/client/write/writer_client.rs | 33 +-- fluss-rust/crates/fluss/src/record/arrow.rs | 211 +++++++++++++----- .../crates/fluss/tests/integration/admin.rs | 8 +- .../fluss/tests/integration/fluss_cluster.rs | 67 ++++-- .../crates/fluss/tests/integration/table.rs | 132 +++++++++++ .../crates/fluss/tests/integration/utils.rs | 30 +++ fluss-rust/crates/fluss/tests/test_fluss.rs | 3 + 13 files changed, 443 insertions(+), 101 deletions(-) create mode 100644 fluss-rust/crates/fluss/tests/integration/table.rs create mode 100644 fluss-rust/crates/fluss/tests/integration/utils.rs diff --git a/fluss-rust/.github/workflows/ci.yml b/fluss-rust/.github/workflows/ci.yml index 73e2b3f172..69625f8f2e 100644 --- a/fluss-rust/.github/workflows/ci.yml +++ b/fluss-rust/.github/workflows/ci.yml @@ -91,7 +91,7 @@ jobs: # only run IT in linux since no docker in macos by default run: | if [ "$RUNNER_OS" == "Linux" ]; then - cargo test --features integration_tests --all-targets --workspace + RUST_TEST_THREADS=1 cargo test --features integration_tests --all-targets --workspace -- --nocapture fi env: RUST_LOG: DEBUG diff --git a/fluss-rust/crates/fluss/src/client/table/append.rs b/fluss-rust/crates/fluss/src/client/table/append.rs index bf15266706..ad3e55e288 100644 --- a/fluss-rust/crates/fluss/src/client/table/append.rs +++ b/fluss-rust/crates/fluss/src/client/table/append.rs @@ -16,12 +16,12 @@ // under the License. 
use crate::client::{WriteRecord, WriterClient}; +use crate::error::Result; use crate::metadata::{TableInfo, TablePath}; use crate::row::GenericRow; +use arrow::array::RecordBatch; use std::sync::Arc; -use crate::error::Result; - #[allow(dead_code)] pub struct TableAppend { table_path: TablePath, @@ -63,6 +63,13 @@ impl AppendWriter { result_handle.result(result) } + pub async fn append_arrow_batch(&self, batch: RecordBatch) -> Result<()> { + let record = WriteRecord::new_record_batch(self.table_path.clone(), batch); + let result_handle = self.writer_client.send(&record).await?; + let result = result_handle.wait().await?; + result_handle.result(result) + } + pub async fn flush(&self) -> Result<()> { self.writer_client.flush().await } diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs index 32622c7b2d..e4ca957827 100644 --- a/fluss-rust/crates/fluss/src/client/write/accumulator.rs +++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs @@ -17,7 +17,7 @@ use crate::client::write::batch::WriteBatch::ArrowLog; use crate::client::write::batch::{ArrowLogWriteBatch, WriteBatch}; -use crate::client::{ResultHandle, WriteRecord}; +use crate::client::{Record, ResultHandle, WriteRecord}; use crate::cluster::{BucketLocation, Cluster, ServerNode}; use crate::config::Config; use crate::error::Result; @@ -105,6 +105,7 @@ impl RecordAccumulator { row_type, bucket_id, current_time_ms(), + matches!(record.row, Record::RecordBatch(_)), )); let batch_id = batch.batch_id(); @@ -159,7 +160,6 @@ impl RecordAccumulator { true, false, true, )); } - self.append_new_batch(cluster, record, bucket_id, &mut dq_guard) } diff --git a/fluss-rust/crates/fluss/src/client/write/batch.rs b/fluss-rust/crates/fluss/src/client/write/batch.rs index 64c5dd6517..13b3d36402 100644 --- a/fluss-rust/crates/fluss/src/client/write/batch.rs +++ b/fluss-rust/crates/fluss/src/client/write/batch.rs @@ -18,11 +18,10 @@ use 
crate::BucketId; use crate::client::broadcast::{BatchWriteResult, BroadcastOnce}; use crate::client::{ResultHandle, WriteRecord}; -use crate::metadata::{DataType, TablePath}; -use std::cmp::max; - use crate::error::Result; +use crate::metadata::{DataType, TablePath}; use crate::record::MemoryLogRecordsArrowBuilder; +use std::cmp::max; #[allow(dead_code)] pub struct InnerWriteBatch { @@ -140,12 +139,16 @@ impl ArrowLogWriteBatch { row_type: &DataType, bucket_id: BucketId, create_ms: i64, + to_append_record_batch: bool, ) -> Self { let base = InnerWriteBatch::new(batch_id, table_path, create_ms, bucket_id); - Self { write_batch: base, - arrow_builder: MemoryLogRecordsArrowBuilder::new(schema_id, row_type), + arrow_builder: MemoryLogRecordsArrowBuilder::new( + schema_id, + row_type, + to_append_record_batch, + ), } } @@ -157,8 +160,13 @@ impl ArrowLogWriteBatch { if self.arrow_builder.is_closed() || self.arrow_builder.is_full() { Ok(None) } else { - self.arrow_builder.append(&write_record.row)?; - Ok(Some(ResultHandle::new(self.write_batch.results.receiver()))) + // append successfully + if self.arrow_builder.append(write_record)? 
{ + Ok(Some(ResultHandle::new(self.write_batch.results.receiver()))) + } else { + // append fail + Ok(None) + } } } diff --git a/fluss-rust/crates/fluss/src/client/write/mod.rs b/fluss-rust/crates/fluss/src/client/write/mod.rs index 74df951115..e632cde451 100644 --- a/fluss-rust/crates/fluss/src/client/write/mod.rs +++ b/fluss-rust/crates/fluss/src/client/write/mod.rs @@ -23,6 +23,7 @@ use crate::error::Error; use crate::metadata::TablePath; use crate::row::GenericRow; pub use accumulator::*; +use arrow::array::RecordBatch; use std::sync::Arc; pub(crate) mod broadcast; @@ -34,13 +35,28 @@ mod writer_client; pub use writer_client::WriterClient; pub struct WriteRecord<'a> { - pub row: GenericRow<'a>, + pub row: Record<'a>, pub table_path: Arc, } +pub enum Record<'a> { + Row(GenericRow<'a>), + RecordBatch(Arc), +} + impl<'a> WriteRecord<'a> { pub fn new(table_path: Arc, row: GenericRow<'a>) -> Self { - Self { row, table_path } + Self { + row: Record::Row(row), + table_path, + } + } + + pub fn new_record_batch(table_path: Arc, row: RecordBatch) -> Self { + Self { + row: Record::RecordBatch(Arc::new(row)), + table_path, + } } } diff --git a/fluss-rust/crates/fluss/src/client/write/sender.rs b/fluss-rust/crates/fluss/src/client/write/sender.rs index e25e2bace2..27460e3863 100644 --- a/fluss-rust/crates/fluss/src/client/write/sender.rs +++ b/fluss-rust/crates/fluss/src/client/write/sender.rs @@ -122,7 +122,6 @@ impl Sender { collated: &HashMap>>, ) -> Result<()> { for (leader_id, batches) in collated { - println!("send request batch"); self.send_write_request(*leader_id, self.ack, batches) .await?; } diff --git a/fluss-rust/crates/fluss/src/client/write/writer_client.rs b/fluss-rust/crates/fluss/src/client/write/writer_client.rs index 01fe2899ba..28f5371e8d 100644 --- a/fluss-rust/crates/fluss/src/client/write/writer_client.rs +++ b/fluss-rust/crates/fluss/src/client/write/writer_client.rs @@ -90,20 +90,12 @@ impl WriterClient { let table_path = &record.table_path; let 
cluster = self.metadata.get_cluster(); - let bucket_assigner = { - if let Some(assigner) = self.bucket_assigners.get(table_path) { - assigner.clone() - } else { - let assigner = Arc::new(Self::create_bucket_assigner(table_path.as_ref())); - self.bucket_assigners - .insert(table_path.as_ref().clone(), assigner.clone()); - assigner - } - }; + let (bucket_assigner, bucket_id) = self.assign_bucket(table_path); - let bucket_id = bucket_assigner.assign_bucket(None, &cluster); - - let mut result = self.accumulate.append(record, 1, &cluster, true).await?; + let mut result = self + .accumulate + .append(record, bucket_id, &cluster, true) + .await?; if result.abort_record_for_new_batch { let prev_bucket_id = bucket_id; @@ -121,6 +113,21 @@ impl WriterClient { Ok(result.result_handle.expect("result_handle should exist")) } + fn assign_bucket(&self, table_path: &Arc) -> (Arc>, i32) { + let cluster = self.metadata.get_cluster(); + let bucket_assigner = { + if let Some(assigner) = self.bucket_assigners.get(table_path) { + assigner.clone() + } else { + let assigner = Arc::new(Self::create_bucket_assigner(table_path.as_ref())); + self.bucket_assigners + .insert(table_path.as_ref().clone(), assigner.clone()); + assigner + } + }; + let bucket_id = bucket_assigner.assign_bucket(None, &cluster); + (bucket_assigner, bucket_id) + } pub async fn close(self) -> Result<()> { self.shutdown_tx diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index fa63b00603..487f50c348 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -15,6 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+use crate::client::{Record, WriteRecord}; +use crate::error::Result; +use crate::metadata::DataType; +use crate::record::{ChangeType, ScanRecord}; +use crate::row::{ColumnarRow, GenericRow}; use arrow::array::{ ArrayBuilder, ArrayRef, BinaryBuilder, BooleanBuilder, Float32Builder, Float64Builder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, StringBuilder, UInt8Builder, @@ -35,11 +40,6 @@ use std::{ sync::Arc, }; -use crate::error::Result; -use crate::metadata::DataType; -use crate::record::{ChangeType, ScanRecord}; -use crate::row::{ColumnarRow, GenericRow}; - /// const for record batch pub const BASE_OFFSET_LENGTH: usize = 8; pub const LENGTH_LENGTH: usize = 4; @@ -95,14 +95,71 @@ pub struct MemoryLogRecordsArrowBuilder { magic: u8, writer_id: i64, batch_sequence: i32, + arrow_record_batch_builder: Box, + is_closed: bool, +} + +pub trait ArrowRecordBatchInnerBuilder: Send + Sync { + fn build_arrow_record_batch(&self) -> Result>; + + fn append(&mut self, row: &GenericRow) -> Result; + + fn append_batch(&mut self, record_batch: Arc) -> Result; + + fn schema(&self) -> SchemaRef; + + fn records_count(&self) -> i32; + + fn is_full(&self) -> bool; +} + +#[derive(Default)] +pub struct PrebuiltRecordBatchBuilder { + arrow_record_batch: Option>, + records_count: i32, +} + +impl ArrowRecordBatchInnerBuilder for PrebuiltRecordBatchBuilder { + fn build_arrow_record_batch(&self) -> Result> { + Ok(self.arrow_record_batch.as_ref().unwrap().clone()) + } + + fn append(&mut self, _row: &GenericRow) -> Result { + // append one single row is not supported, return false directly + Ok(false) + } + + fn append_batch(&mut self, record_batch: Arc) -> Result { + if self.arrow_record_batch.is_some() { + return Ok(false); + } + self.records_count = record_batch.num_rows() as i32; + self.arrow_record_batch = Some(record_batch); + Ok(true) + } + + fn schema(&self) -> SchemaRef { + self.arrow_record_batch.as_ref().unwrap().schema() + } + + fn records_count(&self) -> i32 { + 
self.records_count + } + + fn is_full(&self) -> bool { + // full if has one record batch + self.arrow_record_batch.is_some() + } +} + +pub struct RowAppendRecordBatchBuilder { table_schema: SchemaRef, - record_count: i32, arrow_column_builders: Mutex>>, - is_closed: bool, + records_count: i32, } -impl MemoryLogRecordsArrowBuilder { - pub fn new(schema_id: i32, row_type: &DataType) -> Self { +impl RowAppendRecordBatchBuilder { + pub fn new(row_type: &DataType) -> Self { let schema_ref = to_arrow_schema(row_type); let builders = Mutex::new( schema_ref @@ -111,32 +168,106 @@ impl MemoryLogRecordsArrowBuilder { .map(|field| Self::create_builder(field.data_type())) .collect(), ); + Self { + table_schema: schema_ref.clone(), + arrow_column_builders: builders, + records_count: 0, + } + } + + fn create_builder(data_type: &arrow_schema::DataType) -> Box { + match data_type { + arrow_schema::DataType::Int8 => Box::new(Int8Builder::new()), + arrow_schema::DataType::Int16 => Box::new(Int16Builder::new()), + arrow_schema::DataType::Int32 => Box::new(Int32Builder::new()), + arrow_schema::DataType::Int64 => Box::new(Int64Builder::new()), + arrow_schema::DataType::UInt8 => Box::new(UInt8Builder::new()), + arrow_schema::DataType::UInt16 => Box::new(UInt16Builder::new()), + arrow_schema::DataType::UInt32 => Box::new(UInt32Builder::new()), + arrow_schema::DataType::UInt64 => Box::new(UInt64Builder::new()), + arrow_schema::DataType::Float32 => Box::new(Float32Builder::new()), + arrow_schema::DataType::Float64 => Box::new(Float64Builder::new()), + arrow_schema::DataType::Boolean => Box::new(BooleanBuilder::new()), + arrow_schema::DataType::Utf8 => Box::new(StringBuilder::new()), + arrow_schema::DataType::Binary => Box::new(BinaryBuilder::new()), + dt => panic!("Unsupported data type: {dt:?}"), + } + } +} + +impl ArrowRecordBatchInnerBuilder for RowAppendRecordBatchBuilder { + fn build_arrow_record_batch(&self) -> Result> { + let arrays = self + .arrow_column_builders + .lock() + 
.iter_mut() + .map(|b| b.finish()) + .collect::>(); + Ok(Arc::new(RecordBatch::try_new( + self.table_schema.clone(), + arrays, + )?)) + } + + fn append(&mut self, row: &GenericRow) -> Result { + for (idx, value) in row.values.iter().enumerate() { + let mut builder_binding = self.arrow_column_builders.lock(); + let builder = builder_binding.get_mut(idx).unwrap(); + value.append_to(builder.as_mut())?; + } + self.records_count += 1; + Ok(true) + } + + fn append_batch(&mut self, _record_batch: Arc) -> Result { + Ok(false) + } + + fn schema(&self) -> SchemaRef { + self.table_schema.clone() + } + + fn records_count(&self) -> i32 { + self.records_count + } + + fn is_full(&self) -> bool { + self.records_count() >= DEFAULT_MAX_RECORD + } +} + +impl MemoryLogRecordsArrowBuilder { + pub fn new(schema_id: i32, row_type: &DataType, to_append_record_batch: bool) -> Self { + let arrow_batch_builder: Box = { + if to_append_record_batch { + Box::new(PrebuiltRecordBatchBuilder::default()) + } else { + Box::new(RowAppendRecordBatchBuilder::new(row_type)) + } + }; MemoryLogRecordsArrowBuilder { base_log_offset: BUILDER_DEFAULT_OFFSET, schema_id, magic: CURRENT_LOG_MAGIC_VALUE, writer_id: NO_WRITER_ID, batch_sequence: NO_BATCH_SEQUENCE, - record_count: 0, - table_schema: schema_ref, - arrow_column_builders: builders, is_closed: false, + arrow_record_batch_builder: arrow_batch_builder, } } - pub fn append(&mut self, row: &GenericRow) -> Result<()> { - for (idx, value) in row.values.iter().enumerate() { - let mut builder_binding = self.arrow_column_builders.lock(); - let builder = builder_binding.get_mut(idx).unwrap(); - value.append_to(builder.as_mut())?; + pub fn append(&mut self, record: &WriteRecord) -> Result { + match &record.row { + Record::Row(row) => Ok(self.arrow_record_batch_builder.append(row)?), + Record::RecordBatch(record_batch) => Ok(self + .arrow_record_batch_builder + .append_batch(record_batch.clone())?), } - self.record_count += 1; // todo: consider write other change 
type - Ok(()) } pub fn is_full(&self) -> bool { - self.record_count >= DEFAULT_MAX_RECORD + self.arrow_record_batch_builder.records_count() >= DEFAULT_MAX_RECORD } pub fn is_closed(&self) -> bool { @@ -150,18 +281,12 @@ impl MemoryLogRecordsArrowBuilder { pub fn build(&self) -> Result> { // serialize arrow batch let mut arrow_batch_bytes = vec![]; - let mut writer = StreamWriter::try_new(&mut arrow_batch_bytes, &self.table_schema)?; - - let arrays = self - .arrow_column_builders - .lock() - .iter_mut() - .map(|b| b.finish()) - .collect::>(); - let record_batch = RecordBatch::try_new(self.table_schema.clone(), arrays)?; + let table_schema = self.arrow_record_batch_builder.schema(); + let mut writer = StreamWriter::try_new(&mut arrow_batch_bytes, &table_schema)?; // get header len let header = writer.get_ref().len(); - writer.write(&record_batch)?; + let record_batch = self.arrow_record_batch_builder.build_arrow_record_batch()?; + writer.write(record_batch.as_ref())?; // get real arrow batch bytes let real_arrow_batch_bytes = &arrow_batch_bytes[header..]; @@ -195,39 +320,21 @@ impl MemoryLogRecordsArrowBuilder { cursor.write_u32::(0)?; // crc placeholder cursor.write_i16::(self.schema_id as i16)?; + let record_count = self.arrow_record_batch_builder.records_count(); // todo: curerntly, always is append only let append_only = true; cursor.write_u8(if append_only { 1 } else { 0 })?; - cursor.write_i32::(if self.record_count > 0 { - self.record_count - 1 + cursor.write_i32::(if record_count > 0 { + record_count - 1 } else { 0 })?; cursor.write_i64::(self.writer_id)?; cursor.write_i32::(self.batch_sequence)?; - cursor.write_i32::(self.record_count)?; + cursor.write_i32::(record_count)?; Ok(()) } - - fn create_builder(data_type: &arrow_schema::DataType) -> Box { - match data_type { - arrow_schema::DataType::Int8 => Box::new(Int8Builder::new()), - arrow_schema::DataType::Int16 => Box::new(Int16Builder::new()), - arrow_schema::DataType::Int32 => 
Box::new(Int32Builder::new()), - arrow_schema::DataType::Int64 => Box::new(Int64Builder::new()), - arrow_schema::DataType::UInt8 => Box::new(UInt8Builder::new()), - arrow_schema::DataType::UInt16 => Box::new(UInt16Builder::new()), - arrow_schema::DataType::UInt32 => Box::new(UInt32Builder::new()), - arrow_schema::DataType::UInt64 => Box::new(UInt64Builder::new()), - arrow_schema::DataType::Float32 => Box::new(Float32Builder::new()), - arrow_schema::DataType::Float64 => Box::new(Float64Builder::new()), - arrow_schema::DataType::Boolean => Box::new(BooleanBuilder::new()), - arrow_schema::DataType::Utf8 => Box::new(StringBuilder::new()), - arrow_schema::DataType::Binary => Box::new(BinaryBuilder::new()), - dt => panic!("Unsupported data type: {dt:?}"), - } - } } pub trait ToArrow { diff --git a/fluss-rust/crates/fluss/tests/integration/admin.rs b/fluss-rust/crates/fluss/tests/integration/admin.rs index 0d958a5656..c51373d2cb 100644 --- a/fluss-rust/crates/fluss/tests/integration/admin.rs +++ b/fluss-rust/crates/fluss/tests/integration/admin.rs @@ -38,20 +38,24 @@ mod admin_test { TablePath, }; use std::sync::Arc; + use std::thread; fn before_all() { // Create a new tokio runtime in a separate thread let cluster_guard = SHARED_FLUSS_CLUSTER.clone(); - std::thread::spawn(move || { + thread::spawn(move || { let rt = tokio::runtime::Runtime::new().expect("Failed to create runtime"); rt.block_on(async { - let cluster = FlussTestingClusterBuilder::new().build().await; + let cluster = FlussTestingClusterBuilder::new("test-admin").build().await; let mut guard = cluster_guard.write(); *guard = Some(cluster); }); }) .join() .expect("Failed to create cluster"); + // wait for 20 seconds to avoid the error like + // CoordinatorEventProcessor is not initialized yet + thread::sleep(std::time::Duration::from_secs(20)); } fn get_fluss_cluster() -> Arc { diff --git a/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs b/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs 
index 83a47956ae..e827e14932 100644 --- a/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs +++ b/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs @@ -28,13 +28,14 @@ use testcontainers::{ContainerAsync, GenericImage, ImageExt}; const FLUSS_VERSION: &str = "0.7.0"; pub struct FlussTestingClusterBuilder { - number_of_tablet_servers: usize, + number_of_tablet_servers: i32, network: &'static str, cluster_conf: HashMap, + testing_name: String, } impl FlussTestingClusterBuilder { - pub fn new() -> Self { + pub fn new(testing_name: impl Into) -> Self { // reduce testing resources let mut cluster_conf = HashMap::new(); cluster_conf.insert( @@ -50,14 +51,27 @@ impl FlussTestingClusterBuilder { number_of_tablet_servers: 1, cluster_conf, network: "fluss-cluster-network", + testing_name: testing_name.into(), } } + fn tablet_server_container_name(&self, server_id: i32) -> String { + format!("tablet-server-{}-{}", self.testing_name, server_id) + } + + fn coordinator_server_container_name(&self) -> String { + format!("coordinator-server-{}", self.testing_name) + } + + fn zookeeper_container_name(&self) -> String { + format!("zookeeper-{}", self.testing_name) + } + pub async fn build(&mut self) -> FlussTestingCluster { let zookeeper = Arc::new( GenericImage::new("zookeeper", "3.9.2") .with_network(self.network) - .with_container_name("zookeeper") + .with_container_name(self.zookeeper_container_name()) .start() .await .unwrap(), @@ -83,15 +97,25 @@ impl FlussTestingClusterBuilder { async fn start_coordinator_server(&mut self) -> ContainerAsync { let mut coordinator_confs = HashMap::new(); - coordinator_confs.insert("zookeeper.address", "zookeeper:2181"); + coordinator_confs.insert( + "zookeeper.address", + format!("{}:2181", self.zookeeper_container_name()), + ); coordinator_confs.insert( "bind.listeners", - "INTERNAL://coordinator-server:0, CLIENT://coordinator-server:9123", + format!( + "INTERNAL://{}:0, CLIENT://{}:9123", + 
self.coordinator_server_container_name(), + self.coordinator_server_container_name() + ), ); - coordinator_confs.insert("advertised.listeners", "CLIENT://localhost:9123"); - coordinator_confs.insert("internal.listener.name", "INTERNAL"); + coordinator_confs.insert( + "advertised.listeners", + "CLIENT://localhost:9123".to_string(), + ); + coordinator_confs.insert("internal.listener.name", "INTERNAL".to_string()); GenericImage::new("fluss/fluss", FLUSS_VERSION) - .with_container_name("coordinator-server") + .with_container_name(self.coordinator_server_container_name()) .with_mapped_port(9123, ContainerPort::Tcp(9123)) .with_network(self.network) .with_cmd(vec!["coordinatorServer"]) @@ -104,26 +128,30 @@ impl FlussTestingClusterBuilder { .unwrap() } - async fn start_tablet_server(&self, server_id: usize) -> ContainerAsync { + async fn start_tablet_server(&self, server_id: i32) -> ContainerAsync { let mut tablet_server_confs = HashMap::new(); let bind_listeners = format!( - "INTERNAL://tablet-server-{}:0, CLIENT://tablet-server-{}:9123", - server_id, server_id + "INTERNAL://{}:0, CLIENT://{}:9123", + self.tablet_server_container_name(server_id), + self.tablet_server_container_name(server_id), ); let expose_host_port = 9124 + server_id; let advertised_listeners = format!("CLIENT://localhost:{}", expose_host_port); let tablet_server_id = format!("{}", server_id); - tablet_server_confs.insert("zookeeper.address", "zookeeper:2181"); - tablet_server_confs.insert("bind.listeners", bind_listeners.as_str()); - tablet_server_confs.insert("advertised.listeners", advertised_listeners.as_str()); - tablet_server_confs.insert("internal.listener.name", "INTERNAL"); - tablet_server_confs.insert("tablet-server.id", tablet_server_id.as_str()); + tablet_server_confs.insert( + "zookeeper.address", + format!("{}:2181", self.zookeeper_container_name()), + ); + tablet_server_confs.insert("bind.listeners", bind_listeners); + tablet_server_confs.insert("advertised.listeners", 
advertised_listeners); + tablet_server_confs.insert("internal.listener.name", "INTERNAL".to_string()); + tablet_server_confs.insert("tablet-server.id", tablet_server_id); GenericImage::new("fluss/fluss", FLUSS_VERSION) .with_cmd(vec!["tabletServer"]) .with_mapped_port(expose_host_port as u16, ContainerPort::Tcp(9123)) .with_network(self.network) - .with_container_name(format!("tablet-server-{}", server_id)) + .with_container_name(self.tablet_server_container_name(server_id)) .with_env_var( "FLUSS_PROPERTIES", self.to_fluss_properties_with(tablet_server_confs), @@ -133,7 +161,7 @@ impl FlussTestingClusterBuilder { .unwrap() } - fn to_fluss_properties_with(&self, extra_properties: HashMap<&str, &str>) -> String { + fn to_fluss_properties_with(&self, extra_properties: HashMap<&str, String>) -> String { let mut fluss_properties = Vec::new(); for (k, v) in self.cluster_conf.iter() { fluss_properties.push(format!("{}: {}", k, v)); @@ -150,7 +178,7 @@ impl FlussTestingClusterBuilder { pub struct FlussTestingCluster { zookeeper: Arc>, coordinator_server: Arc>, - tablet_servers: HashMap>>, + tablet_servers: HashMap>>, bootstrap_servers: String, } @@ -165,6 +193,7 @@ impl FlussTestingCluster { pub async fn get_fluss_connection(&self) -> FlussConnection { let mut config = Config::default(); + config.writer_acks = "all".to_string(); config.bootstrap_server = Some(self.bootstrap_servers.clone()); // Retry mechanism: retry for up to 1 minute diff --git a/fluss-rust/crates/fluss/tests/integration/table.rs b/fluss-rust/crates/fluss/tests/integration/table.rs new file mode 100644 index 0000000000..a1a6cb288a --- /dev/null +++ b/fluss-rust/crates/fluss/tests/integration/table.rs @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use crate::integration::fluss_cluster::FlussTestingCluster; +use once_cell::sync::Lazy; +use parking_lot::RwLock; +use std::sync::Arc; + +#[cfg(test)] +use test_env_helpers::*; + +// Module-level shared cluster instance (only for this test file) +static SHARED_FLUSS_CLUSTER: Lazy>>> = + Lazy::new(|| Arc::new(RwLock::new(None))); + +#[cfg(test)] +#[before_all] +#[after_all] +mod table_test { + use super::SHARED_FLUSS_CLUSTER; + use crate::integration::fluss_cluster::{FlussTestingCluster, FlussTestingClusterBuilder}; + use crate::integration::utils::create_table; + use arrow::array::record_batch; + use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; + use std::sync::Arc; + use std::sync::atomic::AtomicUsize; + use std::thread; + fn before_all() { + // Create a new tokio runtime in a separate thread + let cluster_guard = SHARED_FLUSS_CLUSTER.clone(); + std::thread::spawn(move || { + let rt = tokio::runtime::Runtime::new().expect("Failed to create runtime"); + rt.block_on(async { + let cluster = FlussTestingClusterBuilder::new("test_table").build().await; + let mut guard = cluster_guard.write(); + *guard = Some(cluster); + }); + }) + .join() + .expect("Failed to create cluster"); + // wait for 20 seconds to avoid the error like + // CoordinatorEventProcessor is not initialized yet + thread::sleep(std::time::Duration::from_secs(20)); + } + + fn get_fluss_cluster() -> 
Arc { + let cluster_guard = SHARED_FLUSS_CLUSTER.read(); + if cluster_guard.is_none() { + panic!("Fluss cluster not initialized. Make sure before_all() was called."); + } + Arc::new(cluster_guard.as_ref().unwrap().clone()) + } + + fn after_all() { + // Create a new tokio runtime in a separate thread + let cluster_guard = SHARED_FLUSS_CLUSTER.clone(); + std::thread::spawn(move || { + let rt = tokio::runtime::Runtime::new().expect("Failed to create runtime"); + rt.block_on(async { + let mut guard = cluster_guard.write(); + if let Some(cluster) = guard.take() { + cluster.stop().await; + } + }); + }) + .join() + .expect("Failed to cleanup cluster"); + } + + #[tokio::test] + async fn append_record_batch() { + let cluster = get_fluss_cluster(); + let connection = cluster.get_fluss_connection().await; + + let admin = connection.get_admin().await.expect("Failed to get admin"); + + let table_path = + TablePath::new("fluss".to_string(), "test_append_record_batch".to_string()); + + let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("c1", DataTypes::int()) + .column("c2", DataTypes::string()) + .build() + .expect("Failed to build schema"), + ) + .build() + .expect("Failed to build table"); + + create_table(&admin, &table_path, &table_descriptor).await; + + let append_writer = connection + .get_table(&table_path) + .await + .expect("Failed to get table") + .new_append() + .expect("Failed to create append") + .create_writer(); + + let batch1 = + record_batch!(("c1", Int32, [1, 2, 3]), ("c2", Utf8, ["a1", "a2", "a3"])).unwrap(); + append_writer + .append_arrow_batch(batch1) + .await + .expect("Failed to append batch"); + + let batch2 = + record_batch!(("c1", Int32, [4, 5, 6]), ("c2", Utf8, ["a4", "a5", "a6"])).unwrap(); + append_writer + .append_arrow_batch(batch2) + .await + .expect("Failed to append batch"); + + // todo: add scan code to verify the records appended in #30 + } +} diff --git 
a/fluss-rust/crates/fluss/tests/integration/utils.rs b/fluss-rust/crates/fluss/tests/integration/utils.rs new file mode 100644 index 0000000000..cd1f6ccb2d --- /dev/null +++ b/fluss-rust/crates/fluss/tests/integration/utils.rs @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +use fluss::client::FlussAdmin; +use fluss::metadata::{TableDescriptor, TablePath}; + +pub async fn create_table( + admin: &FlussAdmin, + table_path: &TablePath, + table_descriptor: &TableDescriptor, +) { + admin + .create_table(&table_path, &table_descriptor, false) + .await + .expect("Failed to create table"); +} diff --git a/fluss-rust/crates/fluss/tests/test_fluss.rs b/fluss-rust/crates/fluss/tests/test_fluss.rs index 28b9bef7d9..a15ca2395a 100644 --- a/fluss-rust/crates/fluss/tests/test_fluss.rs +++ b/fluss-rust/crates/fluss/tests/test_fluss.rs @@ -22,4 +22,7 @@ extern crate fluss; mod integration { mod admin; mod fluss_cluster; + mod table; + + mod utils; } From 073d41ba8ff0a1a07172d84138047ee98a9edbd7 Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Mon, 24 Nov 2025 10:40:02 +0800 Subject: [PATCH 017/287] [chore] Bump arrow version to 57 (#58) --- fluss-rust/Cargo.toml | 2 +- fluss-rust/bindings/python/Cargo.toml | 8 ++++-- fluss-rust/bindings/python/src/admin.rs | 6 ++-- fluss-rust/bindings/python/src/connection.rs | 12 ++++---- fluss-rust/bindings/python/src/metadata.rs | 6 ++-- fluss-rust/bindings/python/src/table.rs | 18 ++++++------ fluss-rust/bindings/python/src/utils.rs | 30 ++++++++++++-------- fluss-rust/crates/fluss/Cargo.toml | 2 +- 8 files changed, 46 insertions(+), 38 deletions(-) diff --git a/fluss-rust/Cargo.toml b/fluss-rust/Cargo.toml index 54436ac17d..e745d95e07 100644 --- a/fluss-rust/Cargo.toml +++ b/fluss-rust/Cargo.toml @@ -34,5 +34,5 @@ members = ["crates/fluss", "crates/examples", "bindings/python"] fluss = { version = "0.1.0", path = "./crates/fluss" } tokio = { version = "1.44.2", features = ["full"] } clap = { version = "4.5.37", features = ["derive"] } -arrow = "55.1.0" +arrow = "57.0.0" chrono = { version = "0.4", features = ["clock", "std", "wasmbind"] } diff --git a/fluss-rust/bindings/python/Cargo.toml b/fluss-rust/bindings/python/Cargo.toml index 04826fb289..9ecc6299cd 100644 --- a/fluss-rust/bindings/python/Cargo.toml +++ 
b/fluss-rust/bindings/python/Cargo.toml @@ -27,11 +27,13 @@ name = "fluss" crate-type = ["cdylib"] [dependencies] -pyo3 = { version = "0.24", features = ["extension-module"] } +pyo3 = { version = "0.26.0", features = ["extension-module"] } fluss = { path = "../../crates/fluss" } tokio = { workspace = true } arrow = { workspace = true } -arrow-pyarrow = "55.1.0" -pyo3-async-runtimes = { version = "0.24.0", features = ["tokio-runtime"] } +arrow-pyarrow = "57.0.0" +arrow-schema = "57.0.0" +arrow-array = "57.0.0" +pyo3-async-runtimes = { version = "0.26.0", features = ["tokio-runtime"] } chrono = { workspace = true } once_cell = "1.21.3" diff --git a/fluss-rust/bindings/python/src/admin.rs b/fluss-rust/bindings/python/src/admin.rs index 73b2dd3af0..fa189eb80d 100644 --- a/fluss-rust/bindings/python/src/admin.rs +++ b/fluss-rust/bindings/python/src/admin.rs @@ -48,7 +48,7 @@ impl FlussAdmin { .await .map_err(|e| FlussError::new_err(e.to_string()))?; - Python::with_gil(|py| Ok(py.None())) + Python::attach(|py| Ok(py.None())) }) } @@ -67,7 +67,7 @@ impl FlussAdmin { .await .map_err(|e| FlussError::new_err(format!("Failed to get table: {e}")))?; - Python::with_gil(|py| { + Python::attach(|py| { let table_info = TableInfo::from_core(core_table_info); Py::new(py, table_info) }) @@ -89,7 +89,7 @@ impl FlussAdmin { .await .map_err(|e| FlussError::new_err(format!("Failed to get lake snapshot: {e}")))?; - Python::with_gil(|py| { + Python::attach(|py| { let lake_snapshot = LakeSnapshot::from_core(core_lake_snapshot); Py::new(py, lake_snapshot) }) diff --git a/fluss-rust/bindings/python/src/connection.rs b/fluss-rust/bindings/python/src/connection.rs index aeb8410ddf..a7559cec57 100644 --- a/fluss-rust/bindings/python/src/connection.rs +++ b/fluss-rust/bindings/python/src/connection.rs @@ -41,7 +41,7 @@ impl FlussConnection { inner: Arc::new(connection), }; - Python::with_gil(|py| Py::new(py, py_connection)) + Python::attach(|py| Py::new(py, py_connection)) }) } @@ -57,7 +57,7 @@ 
impl FlussConnection { let py_admin = FlussAdmin::from_core(admin); - Python::with_gil(|py| Py::new(py, py_admin)) + Python::attach(|py| Py::new(py, py_admin)) }) } @@ -84,7 +84,7 @@ impl FlussConnection { core_table.has_primary_key(), ); - Python::with_gil(|py| Py::new(py, py_table)) + Python::attach(|py| Py::new(py, py_table)) }) } @@ -102,9 +102,9 @@ impl FlussConnection { #[pyo3(signature = (_exc_type=None, _exc_value=None, _traceback=None))] fn __exit__( &mut self, - _exc_type: Option, - _exc_value: Option, - _traceback: Option, + _exc_type: Option>, + _exc_value: Option>, + _traceback: Option>, ) -> PyResult { self.close()?; Ok(false) diff --git a/fluss-rust/bindings/python/src/metadata.rs b/fluss-rust/bindings/python/src/metadata.rs index 66748ab316..bc5f288071 100644 --- a/fluss-rust/bindings/python/src/metadata.rs +++ b/fluss-rust/bindings/python/src/metadata.rs @@ -106,7 +106,7 @@ impl Schema { #[new] #[pyo3(signature = (schema, primary_keys=None))] pub fn new( - schema: PyObject, // PyArrow schema + schema: Py, // PyArrow schema primary_keys: Option>, ) -> PyResult { let arrow_schema = crate::utils::Utils::pyarrow_to_arrow_schema(&schema)?; @@ -553,7 +553,7 @@ impl LakeSnapshot { /// Get table bucket offsets as a Python dictionary with TableBucket keys #[getter] - pub fn table_buckets_offset(&self, py: Python) -> PyResult { + pub fn table_buckets_offset(&self, py: Python) -> PyResult> { let dict = PyDict::new(py); for (bucket, offset) in &self.table_buckets_offset { let py_bucket = TableBucket::from_core(bucket.clone()); @@ -569,7 +569,7 @@ impl LakeSnapshot { } /// Get all table buckets - pub fn get_table_buckets(&self, py: Python) -> PyResult> { + pub fn get_table_buckets(&self, py: Python) -> PyResult>> { let mut buckets = Vec::new(); for bucket in self.table_buckets_offset.keys() { let py_bucket = TableBucket::from_core(bucket.clone()); diff --git a/fluss-rust/bindings/python/src/table.rs b/fluss-rust/bindings/python/src/table.rs index 
c255fa6f8a..2a8df25c9d 100644 --- a/fluss-rust/bindings/python/src/table.rs +++ b/fluss-rust/bindings/python/src/table.rs @@ -51,7 +51,7 @@ impl FlussTable { let py_writer = AppendWriter::from_core(rust_writer); - Python::with_gil(|py| Py::new(py, py_writer)) + Python::attach(|py| Py::new(py, py_writer)) }) } @@ -75,7 +75,7 @@ impl FlussTable { .map_err(|e| FlussError::new_err(e.to_string()))?; let py_scanner = LogScanner::from_core(rust_scanner, admin, table_info.clone()); - Python::with_gil(|py| Py::new(py, py_scanner)) + Python::attach(|py| Py::new(py, py_scanner)) }) } @@ -131,10 +131,10 @@ pub struct AppendWriter { #[pymethods] impl AppendWriter { /// Write Arrow table data - pub fn write_arrow(&mut self, py: Python, table: PyObject) -> PyResult<()> { + pub fn write_arrow(&mut self, py: Python, table: Py) -> PyResult<()> { // Convert Arrow Table to batches and write each batch let batches = table.call_method0(py, "to_batches")?; - let batch_list: Vec = batches.extract(py)?; + let batch_list: Vec> = batches.extract(py)?; for batch in batch_list { self.write_arrow_batch(py, batch)?; @@ -143,7 +143,7 @@ impl AppendWriter { } /// Write Arrow batch data - pub fn write_arrow_batch(&mut self, py: Python, batch: PyObject) -> PyResult<()> { + pub fn write_arrow_batch(&mut self, py: Python, batch: Py) -> PyResult<()> { // Extract number of rows and columns from the Arrow batch let num_rows: usize = batch.getattr(py, "num_rows")?.extract(py)?; let num_columns: usize = batch.getattr(py, "num_columns")?.extract(py)?; @@ -175,7 +175,7 @@ impl AppendWriter { } /// Write Pandas DataFrame data - pub fn write_pandas(&mut self, py: Python, df: PyObject) -> PyResult<()> { + pub fn write_pandas(&mut self, py: Python, df: Py) -> PyResult<()> { // Import pyarrow module let pyarrow = py.import("pyarrow")?; @@ -213,7 +213,7 @@ impl AppendWriter { fn convert_python_value_to_datum( &self, py: Python, - value: PyObject, + value: Py, ) -> PyResult> { use fcore::row::{Blob, Datum, F32, 
F64}; @@ -321,7 +321,7 @@ impl LogScanner { } /// Convert all data to Arrow Table - fn to_arrow(&self, py: Python) -> PyResult { + fn to_arrow(&self, py: Python) -> PyResult> { use std::collections::HashMap; use std::time::Duration; @@ -387,7 +387,7 @@ impl LogScanner { } /// Convert all data to Pandas DataFrame - fn to_pandas(&self, py: Python) -> PyResult { + fn to_pandas(&self, py: Python) -> PyResult> { let arrow_table = self.to_arrow(py)?; // Convert Arrow Table to Pandas DataFrame using pyarrow diff --git a/fluss-rust/bindings/python/src/utils.rs b/fluss-rust/bindings/python/src/utils.rs index 93933b3774..09e6b5f589 100644 --- a/fluss-rust/bindings/python/src/utils.rs +++ b/fluss-rust/bindings/python/src/utils.rs @@ -16,8 +16,8 @@ // under the License. use crate::*; -use arrow::datatypes::{Schema as ArrowSchema, SchemaRef}; -use arrow_pyarrow::ToPyArrow; +use arrow_pyarrow::{FromPyArrow, ToPyArrow}; +use arrow_schema::SchemaRef; use std::sync::Arc; /// Utilities for schema conversion between PyArrow, Arrow, and Fluss @@ -25,11 +25,10 @@ pub struct Utils; impl Utils { /// Convert PyArrow schema to Rust Arrow schema - pub fn pyarrow_to_arrow_schema(py_schema: &PyObject) -> PyResult { - Python::with_gil(|py| { + pub fn pyarrow_to_arrow_schema(py_schema: &Py) -> PyResult { + Python::attach(|py| { let schema_bound = py_schema.bind(py); - - let schema: ArrowSchema = arrow_pyarrow::FromPyArrow::from_pyarrow_bound(schema_bound) + let schema: arrow_schema::Schema = FromPyArrow::from_pyarrow_bound(schema_bound) .map_err(|e| { FlussError::new_err(format!("Failed to convert PyArrow schema: {e}")) })?; @@ -172,14 +171,21 @@ impl Utils { pub fn combine_batches_to_table( py: Python, batches: Vec>, - ) -> PyResult { - // Convert Rust Arrow RecordBatch to PyObject - let py_batches: Result, _> = batches + ) -> PyResult> { + use arrow_array::RecordBatch as ArrowArrayRecordBatch; + + let py_batches: Result>, _> = batches .iter() .map(|batch| { - 
batch.as_ref().to_pyarrow(py).map_err(|e| { - FlussError::new_err(format!("Failed to convert RecordBatch to PyObject: {e}")) - }) + ArrowArrayRecordBatch::try_new(batch.schema().clone(), batch.columns().to_vec()) + .map_err(|e| FlussError::new_err(format!("Failed to convert RecordBatch: {e}"))) + .and_then(|b| { + ToPyArrow::to_pyarrow(&b, py) + .map(|x| x.into()) + .map_err(|e| { + FlussError::new_err(format!("Failed to convert to PyObject: {e}")) + }) + }) }) .collect(); diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index ab1efc26d2..af770377ce 100644 --- a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -24,7 +24,7 @@ build = "src/build.rs" [dependencies] arrow = { workspace = true } -arrow-schema = "55.1.0" +arrow-schema = "57.0.0" byteorder = "1.5" futures = "0.3" clap = { workspace = true } From 6ea79746e39bf80443fe9e590cee067d3df795b7 Mon Sep 17 00:00:00 2001 From: Pavlos-Petros Tournaris Date: Mon, 24 Nov 2025 04:48:37 +0200 Subject: [PATCH 018/287] chore: add integration test for scan records after append (#51) --- .../crates/fluss/tests/integration/table.rs | 50 ++++++++++++++++++- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/fluss-rust/crates/fluss/tests/integration/table.rs b/fluss-rust/crates/fluss/tests/integration/table.rs index a1a6cb288a..aa02724715 100644 --- a/fluss-rust/crates/fluss/tests/integration/table.rs +++ b/fluss-rust/crates/fluss/tests/integration/table.rs @@ -36,7 +36,8 @@ mod table_test { use crate::integration::fluss_cluster::{FlussTestingCluster, FlussTestingClusterBuilder}; use crate::integration::utils::create_table; use arrow::array::record_batch; - use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; + use fluss::metadata::{DataTypes, Schema, TableBucket, TableDescriptor, TablePath}; + use fluss::row::InternalRow; use std::sync::Arc; use std::sync::atomic::AtomicUsize; use std::thread; @@ -127,6 +128,51 @@ mod table_test { 
.await .expect("Failed to append batch"); - // todo: add scan code to verify the records appended in #30 + // Create scanner to verify appended records + let table = connection + .get_table(&table_path) + .await + .expect("Failed to get table"); + + let table_scan = table.new_scan(); + let log_scanner = table_scan.create_log_scanner(); + + // Subscribe to bucket 0 starting from offset 0 + log_scanner + .subscribe(0, 0) + .await + .expect("Failed to subscribe to bucket"); + + // Poll for records + let scan_records = log_scanner + .poll(tokio::time::Duration::from_secs(5)) + .await + .expect("Failed to poll records"); + + // Verify the scanned records + let table_bucket = TableBucket::new(table.table_info().table_id, 0); + let records = scan_records.records(&table_bucket); + + assert_eq!(records.len(), 6, "Expected 6 records"); + + // Verify record contents match what was appended + let expected_c1_values = vec![1, 2, 3, 4, 5, 6]; + let expected_c2_values = vec!["a1", "a2", "a3", "a4", "a5", "a6"]; + + for (i, record) in records.iter().enumerate() { + let row = record.row(); + assert_eq!( + row.get_int(0), + expected_c1_values[i], + "c1 value mismatch at row {}", + i + ); + assert_eq!( + row.get_string(1), + expected_c2_values[i], + "c2 value mismatch at row {}", + i + ); + } } } From fdbe7f4acab5dceb0369365214fbdc4194c57fa2 Mon Sep 17 00:00:00 2001 From: Evan Date: Tue, 25 Nov 2025 02:39:23 +0100 Subject: [PATCH 019/287] [chore] Add ipc_compression feature to arrow dependency (#59) --- fluss-rust/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fluss-rust/Cargo.toml b/fluss-rust/Cargo.toml index e745d95e07..15bcb796fd 100644 --- a/fluss-rust/Cargo.toml +++ b/fluss-rust/Cargo.toml @@ -34,5 +34,5 @@ members = ["crates/fluss", "crates/examples", "bindings/python"] fluss = { version = "0.1.0", path = "./crates/fluss" } tokio = { version = "1.44.2", features = ["full"] } clap = { version = "4.5.37", features = ["derive"] } -arrow = "57.0.0" 
+arrow = { version = "57.0.0", features = ["ipc_compression"] } chrono = { version = "0.4", features = ["clock", "std", "wasmbind"] } From a62d9fe428ae72b2b6fc465e9fb2f6790e6cbe22 Mon Sep 17 00:00:00 2001 From: Evan Date: Tue, 25 Nov 2025 13:21:59 +0100 Subject: [PATCH 020/287] [chore] Fix build fail on mac os (#61) --- fluss-rust/.github/.DS_Store | Bin 6148 -> 0 bytes fluss-rust/.github/workflows/ci.yml | 55 ++++++++++++++++++++++------ 2 files changed, 44 insertions(+), 11 deletions(-) delete mode 100644 fluss-rust/.github/.DS_Store diff --git a/fluss-rust/.github/.DS_Store b/fluss-rust/.github/.DS_Store deleted file mode 100644 index 7adc49df07876bc71be2a10c09a4f4452106ddaf..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKQA+|r5S~@b6DfQs@bREmAuAK2m+R7ppddP{hYGtxsiW&w^vnkl?z#R@zog&O z%PUs5zR@h88s8 z1WxCo+3_D4ptD$i!>YK=9b4$y4D{mF8 z8}D9@yu?q21K00gQLiOs6ijVDxCpzWc4_TU#)%)s-M&f)J6#O9z6|4z9Jz8BcLplg z(*st)Dzr=G@p!jdt=P4CZBntvd)s91)F+dIwZ5^rf7X789%K2ei52)CRI*}l3NL8< zxW^~2A4f92!?@`j(_BVofEi#07LfsS*IC6y+#K(T8DIu}#Q@z8HY%ZGFx9B84s7W2 zk;Zd`BxuuHg3vbT7)&+d2#V0Dh&ol6Cx+1J=(kOrV=&dI(?O_}aUQd>FfSCLR!6_B z!a+D1xn%~JfmsH!x?7_9-~9glKbyooW`G%3C`lBun8o12 diff --git a/fluss-rust/.github/workflows/ci.yml b/fluss-rust/.github/workflows/ci.yml index 69625f8f2e..cf7a126486 100644 --- a/fluss-rust/.github/workflows/ci.yml +++ b/fluss-rust/.github/workflows/ci.yml @@ -54,17 +54,34 @@ jobs: os: - ubuntu-latest - macos-latest + python: ["3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - name: Install protoc run: | - if [ "$RUNNER_OS" == "Linux" ]; then + if [ "$RUNNER_OS" = "Linux" ]; then sudo apt-get update && sudo apt-get install -y protobuf-compiler - elif [ "$RUNNER_OS" == "macOS" ]; then + elif [ "$RUNNER_OS" = "macOS" ]; then brew install protobuf fi + + - name: Rust Cache + uses: actions/cache@v4 + with: + path: | 
+ ~/.cargo/registry + ~/.cargo/git + target + key: build-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('**/Cargo.lock') }} + - name: Build - run: cargo build + run: cargo build --workspace --all-targets test: runs-on: ${{ matrix.os }} @@ -73,26 +90,42 @@ jobs: os: - ubuntu-latest - macos-latest + python: ["3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - name: Install protoc run: | - if [ "$RUNNER_OS" == "Linux" ]; then + if [ "$RUNNER_OS" = "Linux" ]; then sudo apt-get update && sudo apt-get install -y protobuf-compiler - elif [ "$RUNNER_OS" == "macOS" ]; then + elif [ "$RUNNER_OS" = "macOS" ]; then brew install protobuf fi + + - name: Rust Cache + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: test-${{ runner.os }}-${{ matrix.python }}-${{ hashFiles('**/Cargo.lock') }} + - name: Unit Test run: cargo test --all-targets --workspace env: RUST_LOG: DEBUG RUST_BACKTRACE: full - - name: Integration Test - # only run IT in linux since no docker in macos by default + + - name: Integration Test (Linux only) + if: runner.os == 'Linux' run: | - if [ "$RUNNER_OS" == "Linux" ]; then - RUST_TEST_THREADS=1 cargo test --features integration_tests --all-targets --workspace -- --nocapture - fi + RUST_TEST_THREADS=1 cargo test --features integration_tests --all-targets --workspace -- --nocapture env: RUST_LOG: DEBUG - RUST_BACKTRACE: full \ No newline at end of file + RUST_BACKTRACE: full From 121a03606f7acb8cbd3a61b8a65f4d9eb55caf62 Mon Sep 17 00:00:00 2001 From: Yang Guo <100583615+gyang94@users.noreply.github.com> Date: Sun, 30 Nov 2025 00:15:25 +0800 Subject: [PATCH 021/287] feat: implement get_long for GenericRow (#49) --- .../crates/examples/src/example_table.rs | 10 +++++++--- fluss-rust/crates/fluss/src/row/datum.rs | 19 +++++++++++++++++++ fluss-rust/crates/fluss/src/row/mod.rs | 2 +- 3 files changed, 27 
insertions(+), 4 deletions(-) diff --git a/fluss-rust/crates/examples/src/example_table.rs b/fluss-rust/crates/examples/src/example_table.rs index 3eb8dd867f..deab3639da 100644 --- a/fluss-rust/crates/examples/src/example_table.rs +++ b/fluss-rust/crates/examples/src/example_table.rs @@ -27,7 +27,7 @@ use tokio::try_join; #[tokio::main] pub async fn main() -> Result<()> { let mut config = Config::parse(); - config.bootstrap_server = Some("127.0.0.1:56405".to_string()); + config.bootstrap_server = Some("127.0.0.1:9123".to_string()); let conn = FlussConnection::new(config).await?; @@ -36,11 +36,12 @@ pub async fn main() -> Result<()> { Schema::builder() .column("c1", DataTypes::int()) .column("c2", DataTypes::string()) + .column("c3", DataTypes::bigint()) .build()?, ) .build()?; - let table_path = TablePath::new("fluss".to_owned(), "rust_test".to_owned()); + let table_path = TablePath::new("fluss".to_owned(), "rust_test_long".to_owned()); let admin = conn.get_admin().await?; @@ -56,6 +57,7 @@ pub async fn main() -> Result<()> { let mut row = GenericRow::new(); row.set_field(0, 22222); row.set_field(1, "t2t"); + row.set_field(2, 123_456_789_123i64); let table = conn.get_table(&table_path).await?; let append_writer = table.new_append()?.create_writer(); @@ -63,6 +65,7 @@ pub async fn main() -> Result<()> { row = GenericRow::new(); row.set_field(0, 233333); row.set_field(1, "tt44"); + row.set_field(2, 987_654_321_987i64); let f2 = append_writer.append(row); try_join!(f1, f2, append_writer.flush())?; @@ -76,9 +79,10 @@ pub async fn main() -> Result<()> { for record in scan_records { let row = record.row(); println!( - "{{{}, {}}}@{}", + "{{{}, {}, {}}}@{}", row.get_int(0), row.get_string(1), + row.get_long(2), record.offset() ); } diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs index d8c4f748ca..ed33b8badf 100644 --- a/fluss-rust/crates/fluss/src/row/datum.rs +++ b/fluss-rust/crates/fluss/src/row/datum.rs @@ -88,6 +88,13 
@@ impl<'a> From for Datum<'a> { } } +impl<'a> From for Datum<'a> { + #[inline] + fn from(i: i64) -> Datum<'a> { + Datum::Int64(i) + } +} + impl<'a> From<&'a str> for Datum<'a> { #[inline] fn from(s: &'a str) -> Datum<'a> { @@ -127,6 +134,18 @@ impl TryFrom<&Datum<'_>> for i32 { } } +impl TryFrom<&Datum<'_>> for i64 { + type Error = (); + + #[inline] + fn try_from(from: &Datum) -> std::result::Result { + match from { + Datum::Int64(i) => Ok(*i), + _ => Err(()), + } + } +} + impl<'a> TryFrom<&Datum<'a>> for &'a str { type Error = (); diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index b900cb51d5..aa2c41159e 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -102,7 +102,7 @@ impl<'a> InternalRow for GenericRow<'a> { } fn get_long(&self, _pos: usize) -> i64 { - todo!() + self.values.get(_pos).unwrap().try_into().unwrap() } fn get_float(&self, _pos: usize) -> f32 { From 236ae5af44f6f58ff3f9a4a4a19863c9516f0b80 Mon Sep 17 00:00:00 2001 From: Pavlos-Petros Tournaris Date: Sat, 29 Nov 2025 18:16:27 +0200 Subject: [PATCH 022/287] feat: implement Display trait for DataType and related types (#50) Fixes #38 --- .../crates/fluss/src/metadata/datatype.rs | 474 +++++++++++++++++- fluss-rust/rust-toolchain.toml | 2 +- 2 files changed, 474 insertions(+), 2 deletions(-) diff --git a/fluss-rust/crates/fluss/src/metadata/datatype.rs b/fluss-rust/crates/fluss/src/metadata/datatype.rs index 09ca0c2c57..c7f93264df 100644 --- a/fluss-rust/crates/fluss/src/metadata/datatype.rs +++ b/fluss-rust/crates/fluss/src/metadata/datatype.rs @@ -93,6 +93,32 @@ impl DataType { } } +impl Display for DataType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + DataType::Boolean(v) => write!(f, "{}", v), + DataType::TinyInt(v) => write!(f, "{}", v), + DataType::SmallInt(v) => write!(f, "{}", v), + DataType::Int(v) => write!(f, "{}", v), + DataType::BigInt(v) => write!(f, "{}", v), 
+ DataType::Float(v) => write!(f, "{}", v), + DataType::Double(v) => write!(f, "{}", v), + DataType::Char(v) => write!(f, "{}", v), + DataType::String(v) => write!(f, "{}", v), + DataType::Decimal(v) => write!(f, "{}", v), + DataType::Date(v) => write!(f, "{}", v), + DataType::Time(v) => write!(f, "{}", v), + DataType::Timestamp(v) => write!(f, "{}", v), + DataType::TimestampLTz(v) => write!(f, "{}", v), + DataType::Bytes(v) => write!(f, "{}", v), + DataType::Binary(v) => write!(f, "{}", v), + DataType::Array(v) => write!(f, "{}", v), + DataType::Map(v) => write!(f, "{}", v), + DataType::Row(v) => write!(f, "{}", v), + } + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct BooleanType { nullable: bool, @@ -118,6 +144,16 @@ impl BooleanType { } } +impl Display for BooleanType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "BOOLEAN")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct TinyIntType { nullable: bool, @@ -143,6 +179,16 @@ impl TinyIntType { } } +impl Display for TinyIntType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "TINYINT")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct SmallIntType { nullable: bool, @@ -168,6 +214,16 @@ impl SmallIntType { } } +impl Display for SmallIntType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "SMALLINT")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct IntType { nullable: bool, @@ -193,6 +249,16 @@ impl IntType { } } +impl Display for IntType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "INT")?; + if 
!self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct BigIntType { nullable: bool, @@ -218,6 +284,16 @@ impl BigIntType { } } +impl Display for BigIntType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "BIGINT")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct FloatType { nullable: bool, @@ -243,6 +319,16 @@ impl FloatType { } } +impl Display for FloatType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "FLOAT")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct DoubleType { nullable: bool, @@ -268,6 +354,16 @@ impl DoubleType { } } +impl Display for DoubleType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "DOUBLE")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct CharType { nullable: bool, @@ -327,6 +423,16 @@ impl StringType { } } +impl Display for StringType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "STRING")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct DecimalType { nullable: bool, @@ -370,6 +476,16 @@ impl DecimalType { } } +impl Display for DecimalType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "DECIMAL({}, {})", self.precision, self.scale)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct DateType { nullable: 
bool, @@ -395,6 +511,16 @@ impl DateType { } } +impl Display for DateType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "DATE")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Default, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct TimeType { nullable: bool, @@ -434,6 +560,16 @@ impl TimeType { } } +impl Display for TimeType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "TIME({})", self.precision)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct TimestampType { nullable: bool, @@ -473,6 +609,16 @@ impl TimestampType { } } +impl Display for TimestampType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "TIMESTAMP({})", self.precision)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct TimestampLTzType { nullable: bool, @@ -512,6 +658,16 @@ impl TimestampLTzType { } } +impl Display for TimestampLTzType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "TIMESTAMP_LTZ({})", self.precision)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct BytesType { nullable: bool, @@ -537,6 +693,16 @@ impl BytesType { } } +impl Display for BytesType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "BYTES")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub struct BinaryType { nullable: bool, @@ -567,6 +733,16 @@ impl BinaryType { } } +impl Display for BinaryType { + fn fmt(&self, f: &mut Formatter<'_>) -> 
std::fmt::Result { + write!(f, "BINARY({})", self.length)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct ArrayType { nullable: bool, @@ -597,6 +773,16 @@ impl ArrayType { } } +impl Display for ArrayType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "ARRAY<{}>", self.element_type)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct MapType { nullable: bool, @@ -634,6 +820,16 @@ impl MapType { } } +impl Display for MapType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "MAP<{}, {}>", self.key_type, self.value_type)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] pub struct RowType { nullable: bool, @@ -658,6 +854,23 @@ impl RowType { } } +impl Display for RowType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "ROW<")?; + for (i, field) in self.fields.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{}", field)?; + } + write!(f, ">")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + pub struct DataTypes; impl DataTypes { @@ -823,4 +1036,263 @@ impl DataField { } } -// todo: implement display for datatype +impl Display for DataField { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{} {}", self.name, self.data_type) + } +} + +#[test] +fn test_boolean_display() { + assert_eq!(BooleanType::new().to_string(), "BOOLEAN"); + assert_eq!( + BooleanType::with_nullable(false).to_string(), + "BOOLEAN NOT NULL" + ); +} + +#[test] +fn test_tinyint_display() { + assert_eq!(TinyIntType::new().to_string(), "TINYINT"); + assert_eq!( + TinyIntType::with_nullable(false).to_string(), + "TINYINT NOT NULL" + ); +} + +#[test] +fn 
test_smallint_display() { + assert_eq!(SmallIntType::new().to_string(), "SMALLINT"); + assert_eq!( + SmallIntType::with_nullable(false).to_string(), + "SMALLINT NOT NULL" + ); +} + +#[test] +fn test_int_display() { + assert_eq!(IntType::new().to_string(), "INT"); + assert_eq!(IntType::with_nullable(false).to_string(), "INT NOT NULL"); +} + +#[test] +fn test_bigint_display() { + assert_eq!(BigIntType::new().to_string(), "BIGINT"); + assert_eq!( + BigIntType::with_nullable(false).to_string(), + "BIGINT NOT NULL" + ); +} + +#[test] +fn test_float_display() { + assert_eq!(FloatType::new().to_string(), "FLOAT"); + assert_eq!( + FloatType::with_nullable(false).to_string(), + "FLOAT NOT NULL" + ); +} + +#[test] +fn test_double_display() { + assert_eq!(DoubleType::new().to_string(), "DOUBLE"); + assert_eq!( + DoubleType::with_nullable(false).to_string(), + "DOUBLE NOT NULL" + ); +} + +#[test] +fn test_string_display() { + assert_eq!(StringType::new().to_string(), "STRING"); + assert_eq!( + StringType::with_nullable(false).to_string(), + "STRING NOT NULL" + ); +} + +#[test] +fn test_date_display() { + assert_eq!(DateType::new().to_string(), "DATE"); + assert_eq!(DateType::with_nullable(false).to_string(), "DATE NOT NULL"); +} + +#[test] +fn test_bytes_display() { + assert_eq!(BytesType::new().to_string(), "BYTES"); + assert_eq!( + BytesType::with_nullable(false).to_string(), + "BYTES NOT NULL" + ); +} + +#[test] +fn test_char_display() { + assert_eq!(CharType::new(10).to_string(), "CHAR(10)"); + assert_eq!( + CharType::with_nullable(20, false).to_string(), + "CHAR(20) NOT NULL" + ); +} + +#[test] +fn test_decimal_display() { + assert_eq!(DecimalType::new(10, 2).to_string(), "DECIMAL(10, 2)"); + assert_eq!( + DecimalType::with_nullable(false, 38, 10).to_string(), + "DECIMAL(38, 10) NOT NULL" + ); +} + +#[test] +fn test_time_display() { + assert_eq!(TimeType::new(0).to_string(), "TIME(0)"); + assert_eq!(TimeType::new(3).to_string(), "TIME(3)"); + assert_eq!( + 
TimeType::with_nullable(false, 9).to_string(), + "TIME(9) NOT NULL" + ); +} + +#[test] +fn test_timestamp_display() { + assert_eq!(TimestampType::new(6).to_string(), "TIMESTAMP(6)"); + assert_eq!(TimestampType::new(0).to_string(), "TIMESTAMP(0)"); + assert_eq!( + TimestampType::with_nullable(false, 9).to_string(), + "TIMESTAMP(9) NOT NULL" + ); +} + +#[test] +fn test_timestamp_ltz_display() { + assert_eq!(TimestampLTzType::new(6).to_string(), "TIMESTAMP_LTZ(6)"); + assert_eq!(TimestampLTzType::new(3).to_string(), "TIMESTAMP_LTZ(3)"); + assert_eq!( + TimestampLTzType::with_nullable(false, 9).to_string(), + "TIMESTAMP_LTZ(9) NOT NULL" + ); +} + +#[test] +fn test_binary_display() { + assert_eq!(BinaryType::new(100).to_string(), "BINARY(100)"); + assert_eq!( + BinaryType::with_nullable(false, 256).to_string(), + "BINARY(256) NOT NULL" + ); +} + +#[test] +fn test_array_display() { + let array_type = ArrayType::new(DataTypes::int()); + assert_eq!(array_type.to_string(), "ARRAY"); + + let array_type_non_null = ArrayType::with_nullable(false, DataTypes::string()); + assert_eq!(array_type_non_null.to_string(), "ARRAY NOT NULL"); + + let nested_array = ArrayType::new(DataTypes::array(DataTypes::int())); + assert_eq!(nested_array.to_string(), "ARRAY>"); +} + +#[test] +fn test_map_display() { + let map_type = MapType::new(DataTypes::string(), DataTypes::int()); + assert_eq!(map_type.to_string(), "MAP"); + + let map_type_non_null = + MapType::with_nullable(false, DataTypes::int(), DataTypes::string()); + assert_eq!(map_type_non_null.to_string(), "MAP NOT NULL"); + + let nested_map = MapType::new( + DataTypes::string(), + DataTypes::map(DataTypes::int(), DataTypes::boolean()), + ); + assert_eq!(nested_map.to_string(), "MAP>"); +} + +#[test] +fn test_row_display() { + let fields = vec![ + DataTypes::field("id".to_string(), DataTypes::int()), + DataTypes::field("name".to_string(), DataTypes::string()), + ]; + let row_type = RowType::new(fields); + assert_eq!(row_type.to_string(), 
"ROW"); + + let fields_non_null = vec![DataTypes::field("age".to_string(), DataTypes::bigint())]; + let row_type_non_null = RowType::with_nullable(false, fields_non_null); + assert_eq!(row_type_non_null.to_string(), "ROW NOT NULL"); +} + +#[test] +fn test_datatype_display() { + assert_eq!(DataTypes::boolean().to_string(), "BOOLEAN"); + assert_eq!(DataTypes::int().to_string(), "INT"); + assert_eq!(DataTypes::string().to_string(), "STRING"); + assert_eq!(DataTypes::char(50).to_string(), "CHAR(50)"); + assert_eq!(DataTypes::decimal(10, 2).to_string(), "DECIMAL(10, 2)"); + assert_eq!(DataTypes::time_with_precision(3).to_string(), "TIME(3)"); + assert_eq!( + DataTypes::timestamp_with_precision(6).to_string(), + "TIMESTAMP(6)" + ); + assert_eq!( + DataTypes::timestamp_ltz_with_precision(9).to_string(), + "TIMESTAMP_LTZ(9)" + ); + assert_eq!(DataTypes::array(DataTypes::int()).to_string(), "ARRAY"); + assert_eq!( + DataTypes::map(DataTypes::string(), DataTypes::int()).to_string(), + "MAP" + ); +} + +#[test] +fn test_datafield_display() { + let field = DataTypes::field("user_id".to_string(), DataTypes::bigint()); + assert_eq!(field.to_string(), "user_id BIGINT"); + + let field2 = DataTypes::field("email".to_string(), DataTypes::string()); + assert_eq!(field2.to_string(), "email STRING"); + + let field3 = DataTypes::field("score".to_string(), DataTypes::decimal(10, 2)); + assert_eq!(field3.to_string(), "score DECIMAL(10, 2)"); +} + +#[test] +fn test_complex_nested_display() { + let row_type = DataTypes::row(vec![ + DataTypes::field("id".to_string(), DataTypes::int()), + DataTypes::field("tags".to_string(), DataTypes::array(DataTypes::string())), + DataTypes::field( + "metadata".to_string(), + DataTypes::map(DataTypes::string(), DataTypes::string()), + ), + ]); + assert_eq!( + row_type.to_string(), + "ROW, metadata MAP>" + ); +} + +#[test] +fn test_non_nullable_datatype() { + let nullable_int = DataTypes::int(); + assert_eq!(nullable_int.to_string(), "INT"); + + let 
non_nullable_int = nullable_int.as_non_nullable(); + assert_eq!(non_nullable_int.to_string(), "INT NOT NULL"); +} + +#[test] +fn test_deeply_nested_types() { + let nested = DataTypes::array(DataTypes::map( + DataTypes::string(), + DataTypes::row(vec![ + DataTypes::field("x".to_string(), DataTypes::int()), + DataTypes::field("y".to_string(), DataTypes::int()), + ]), + )); + assert_eq!(nested.to_string(), "ARRAY>>"); +} diff --git a/fluss-rust/rust-toolchain.toml b/fluss-rust/rust-toolchain.toml index 56c3bf5df8..870d7eb7af 100644 --- a/fluss-rust/rust-toolchain.toml +++ b/fluss-rust/rust-toolchain.toml @@ -17,4 +17,4 @@ [toolchain] channel = "stable" -components = ["rustfmt", "clippy"] \ No newline at end of file +components = ["rustfmt", "clippy"] From 3d3de438e20b1cc19dc84c2227d91e7dda05ed0f Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Sun, 30 Nov 2025 00:27:38 +0800 Subject: [PATCH 023/287] chore: fix ci (#69) --- fluss-rust/crates/fluss/src/metadata/datatype.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fluss-rust/crates/fluss/src/metadata/datatype.rs b/fluss-rust/crates/fluss/src/metadata/datatype.rs index c7f93264df..4deed2bce0 100644 --- a/fluss-rust/crates/fluss/src/metadata/datatype.rs +++ b/fluss-rust/crates/fluss/src/metadata/datatype.rs @@ -1200,8 +1200,7 @@ fn test_map_display() { let map_type = MapType::new(DataTypes::string(), DataTypes::int()); assert_eq!(map_type.to_string(), "MAP"); - let map_type_non_null = - MapType::with_nullable(false, DataTypes::int(), DataTypes::string()); + let map_type_non_null = MapType::with_nullable(false, DataTypes::int(), DataTypes::string()); assert_eq!(map_type_non_null.to_string(), "MAP NOT NULL"); let nested_map = MapType::new( From 0f8ff0b50db49be897bca90785fe6f14c2683995 Mon Sep 17 00:00:00 2001 From: AlexZhao Date: Thu, 4 Dec 2025 10:58:56 +0800 Subject: [PATCH 024/287] feat: support ColumnPruning (#57) --- fluss-rust/crates/examples/Cargo.toml | 2 - 
.../crates/fluss/src/client/table/scanner.rs | 115 ++++++++- fluss-rust/crates/fluss/src/record/arrow.rs | 239 +++++++++++++++--- .../crates/fluss/tests/integration/table.rs | 87 ++++++- 4 files changed, 390 insertions(+), 53 deletions(-) diff --git a/fluss-rust/crates/examples/Cargo.toml b/fluss-rust/crates/examples/Cargo.toml index 82d864f818..dab85b66ed 100644 --- a/fluss-rust/crates/examples/Cargo.toml +++ b/fluss-rust/crates/examples/Cargo.toml @@ -27,8 +27,6 @@ version = { workspace = true } fluss = { workspace = true } tokio = { workspace = true } clap = { workspace = true} - - [[example]] name = "example-table" path = "src/example_table.rs" \ No newline at end of file diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index e1ab59ffbb..13372efecf 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -17,12 +17,13 @@ use crate::client::connection::FlussConnection; use crate::client::metadata::Metadata; -use crate::error::Result; +use crate::error::{Error, Result}; use crate::metadata::{TableBucket, TableInfo, TablePath}; use crate::proto::{FetchLogRequest, PbFetchLogReqForBucket, PbFetchLogReqForTable}; use crate::record::{LogRecordsBatchs, ReadContext, ScanRecord, ScanRecords, to_arrow_schema}; use crate::rpc::RpcClient; use crate::util::FairBucketStatusMap; +use arrow_schema::SchemaRef; use parking_lot::RwLock; use std::collections::HashMap; use std::slice::from_ref; @@ -39,6 +40,8 @@ pub struct TableScan<'a> { conn: &'a FlussConnection, table_info: TableInfo, metadata: Arc, + /// Column indices to project. None means all columns, Some(vec) means only the specified columns (non-empty). 
+ projected_fields: Option>, } impl<'a> TableScan<'a> { @@ -47,14 +50,82 @@ impl<'a> TableScan<'a> { conn, table_info, metadata, + projected_fields: None, } } - pub fn create_log_scanner(&self) -> LogScanner { + /// Projects the scan to only include specified columns by their indices. + /// + /// # Arguments + /// * `column_indices` - Zero-based indices of columns to include in the scan + /// + /// # Errors + /// Returns an error if `column_indices` is empty or if any column index is out of range. + /// + /// # Example + /// ``` + /// let scanner = table.new_scan().project(&[0, 2, 3])?.create_log_scanner(); + /// ``` + pub fn project(mut self, column_indices: &[usize]) -> Result { + if column_indices.is_empty() { + return Err(Error::IllegalArgument( + "Column indices cannot be empty".to_string(), + )); + } + let field_count = self.table_info.row_type().fields().len(); + for &idx in column_indices { + if idx >= field_count { + return Err(Error::IllegalArgument(format!( + "Column index {} out of range (max: {})", + idx, + field_count - 1 + ))); + } + } + self.projected_fields = Some(column_indices.to_vec()); + Ok(self) + } + + /// Projects the scan to only include specified columns by their names. + /// + /// # Arguments + /// * `column_names` - Names of columns to include in the scan + /// + /// # Errors + /// Returns an error if `column_names` is empty or if any column name is not found in the table schema. 
+ /// + /// # Example + /// ``` + /// let scanner = table.new_scan().project_by_name(&["col1", "col3"])?.create_log_scanner(); + /// ``` + pub fn project_by_name(mut self, column_names: &[&str]) -> Result { + if column_names.is_empty() { + return Err(Error::IllegalArgument( + "Column names cannot be empty".to_string(), + )); + } + let row_type = self.table_info.row_type(); + let mut indices = Vec::new(); + + for name in column_names { + let idx = row_type + .fields() + .iter() + .position(|f| f.name() == *name) + .ok_or_else(|| Error::IllegalArgument(format!("Column '{name}' not found")))?; + indices.push(idx); + } + + self.projected_fields = Some(indices); + Ok(self) + } + + pub fn create_log_scanner(self) -> LogScanner { LogScanner::new( &self.table_info, self.metadata.clone(), self.conn.get_connections(), + self.projected_fields, ) } } @@ -72,6 +143,7 @@ impl LogScanner { table_info: &TableInfo, metadata: Arc, connections: Arc, + projected_fields: Option>, ) -> Self { let log_scanner_status = Arc::new(LogScannerStatus::new()); Self { @@ -84,6 +156,7 @@ impl LogScanner { connections.clone(), metadata.clone(), log_scanner_status.clone(), + projected_fields, ), } } @@ -114,6 +187,7 @@ struct LogFetcher { table_info: TableInfo, metadata: Arc, log_scanner_status: Arc, + read_context: ReadContext, } impl LogFetcher { @@ -122,13 +196,27 @@ impl LogFetcher { conns: Arc, metadata: Arc, log_scanner_status: Arc, + projected_fields: Option>, ) -> Self { + let full_arrow_schema = to_arrow_schema(table_info.get_row_type()); + let read_context = Self::create_read_context(full_arrow_schema, projected_fields); LogFetcher { table_path: table_info.table_path.clone(), - conns: conns.clone(), - table_info: table_info.clone(), - metadata: metadata.clone(), - log_scanner_status: log_scanner_status.clone(), + conns, + table_info, + metadata, + log_scanner_status, + read_context, + } + } + + fn create_read_context( + full_arrow_schema: SchemaRef, + projected_fields: Option>, + ) -> 
ReadContext { + match projected_fields { + None => ReadContext::new(full_arrow_schema), + Some(fields) => ReadContext::with_projection_pushdown(full_arrow_schema, fields), } } @@ -149,7 +237,7 @@ impl LogFetcher { for pb_fetch_log_resp in fetch_response.tables_resp { let table_id = pb_fetch_log_resp.table_id; let fetch_log_for_buckets = pb_fetch_log_resp.buckets_resp; - let arrow_schema = to_arrow_schema(self.table_info.get_row_type()); + for fetch_log_for_bucket in fetch_log_for_buckets { let mut fetch_records = vec![]; let bucket: i32 = fetch_log_for_bucket.bucket_id; @@ -158,8 +246,7 @@ impl LogFetcher { let data = fetch_log_for_bucket.records.unwrap(); for log_record in &mut LogRecordsBatchs::new(&data) { let last_offset = log_record.last_log_offset(); - fetch_records - .extend(log_record.records(ReadContext::new(arrow_schema.clone()))); + fetch_records.extend(log_record.records(&self.read_context)?); self.log_scanner_status .update_offset(&table_bucket, last_offset + 1); } @@ -209,13 +296,19 @@ impl LogFetcher { if ready_for_fetch_count == 0 { HashMap::new() } else { + let (projection_enabled, projected_fields) = + match self.read_context.project_fields_in_order() { + None => (false, vec![]), + Some(fields) => (true, fields.iter().map(|&i| i as i32).collect()), + }; + fetch_log_req_for_buckets .into_iter() .map(|(leader_id, feq_for_buckets)| { let req_for_table = PbFetchLogReqForTable { table_id: table_id.unwrap(), - projection_pushdown_enabled: false, - projected_fields: vec![], + projection_pushdown_enabled: projection_enabled, + projected_fields: projected_fields.clone(), buckets_req: feq_for_buckets, }; diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index 487f50c348..29bfe41a21 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -27,7 +27,12 @@ use arrow::array::{ }; use arrow::{ array::RecordBatch, - ipc::{reader::StreamReader, 
writer::StreamWriter}, + buffer::Buffer, + ipc::{ + reader::{StreamReader, read_record_batch}, + root_as_message, + writer::StreamWriter, + }, }; use arrow_schema::SchemaRef; use arrow_schema::{DataType as ArrowDataType, Field}; @@ -472,41 +477,84 @@ impl<'a> LogRecordBatch<'a> { LittleEndian::read_i32(&self.data[offset..offset + RECORDS_COUNT_LENGTH]) } - pub fn records(&self, read_context: ReadContext) -> LogRecordIterator { - let count = self.record_count(); - if count == 0 { - return LogRecordIterator::empty(); + pub fn records(&self, read_context: &ReadContext) -> Result { + if self.record_count() == 0 { + return Ok(LogRecordIterator::empty()); } - // get arrow_metadata - let arrow_metadata_bytes = read_context.to_arrow_metadata().unwrap(); - // arrow_batch_data let data = &self.data[RECORDS_OFFSET..]; - // need to combine arrow_metadata_bytes + arrow_batch_data - let cursor = Cursor::new([&arrow_metadata_bytes, data].concat()); - let mut stream_reader = StreamReader::try_new(cursor, None).unwrap(); - - let mut record_batch = None; - if let Some(bath) = stream_reader.next() { - record_batch = Some(bath.unwrap()); - } - - if record_batch.is_none() { - return LogRecordIterator::empty(); - } - - let arrow_reader = ArrowReader::new(Arc::new(record_batch.unwrap())); - LogRecordIterator::Arrow(ArrowLogRecordIterator { - reader: arrow_reader, - base_offset: self.base_log_offset(), - timestamp: self.commit_timestamp(), - row_id: 0, - change_type: ChangeType::AppendOnly, - }) + let record_batch = read_context.record_batch(data)?; + let log_record_iterator = match record_batch { + None => LogRecordIterator::empty(), + Some(record_batch) => { + let arrow_reader = ArrowReader::new(Arc::new(record_batch)); + LogRecordIterator::Arrow(ArrowLogRecordIterator { + reader: arrow_reader, + base_offset: self.base_log_offset(), + timestamp: self.commit_timestamp(), + row_id: 0, + change_type: ChangeType::AppendOnly, + }) + } + }; + Ok(log_record_iterator) } } +/// Parse an Arrow 
IPC message from a byte slice. +/// +/// Server returns RecordBatch message (without Schema message) in the encapsulated message format. +/// Format: [continuation: 4 bytes (0xFFFFFFFF)][metadata_size: 4 bytes][RecordBatch metadata][body] +/// +/// This format is documented at: +/// https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format +/// +/// # Arguments +/// * `data` - The byte slice containing the IPC message. +/// +/// # Returns +/// Returns `Some((batch_metadata, body_buffer, version))` on success: +/// - `batch_metadata`: The RecordBatch metadata from the IPC message. +/// - `body_buffer`: The buffer containing the record batch body data. +/// - `version`: The Arrow IPC metadata version. +/// +/// Returns `None` if the data is malformed or too short. +fn parse_ipc_message( + data: &[u8], +) -> Option<( + arrow::ipc::RecordBatch<'_>, + Buffer, + arrow::ipc::MetadataVersion, +)> { + const CONTINUATION_MARKER: u32 = 0xFFFFFFFF; + + if data.len() < 8 { + return None; + } + + let continuation = LittleEndian::read_u32(&data[0..4]); + let metadata_size = LittleEndian::read_u32(&data[4..8]) as usize; + + if continuation != CONTINUATION_MARKER { + return None; + } + + if data.len() < 8 + metadata_size { + return None; + } + + let metadata_bytes = &data[8..8 + metadata_size]; + let message = root_as_message(metadata_bytes).ok()?; + let batch_metadata = message.header_as_record_batch()?; + + let body_start = 8 + metadata_size; + let body_data = &data[body_start..]; + let body_buffer = Buffer::from(body_data); + + Some((batch_metadata, body_buffer, message.version())) +} + pub fn to_arrow_schema(fluss_schema: &DataType) -> SchemaRef { match &fluss_schema { DataType::Row(row_type) => { @@ -554,19 +602,140 @@ pub fn to_arrow_type(fluss_type: &DataType) -> ArrowDataType { } } +#[derive(Clone)] pub struct ReadContext { - arrow_schema: SchemaRef, + target_schema: SchemaRef, + + projection: Option, +} + +#[derive(Clone)] +struct Projection { + 
ordered_schema: SchemaRef, + projected_fields: Vec, + ordered_fields: Vec, + + reordering_indexes: Vec, + reordering_needed: bool, } impl ReadContext { pub fn new(arrow_schema: SchemaRef) -> ReadContext { - ReadContext { arrow_schema } + ReadContext { + target_schema: arrow_schema, + projection: None, + } } - pub fn to_arrow_metadata(&self) -> Result> { - let mut arrow_schema_bytes = vec![]; - let _writer = StreamWriter::try_new(&mut arrow_schema_bytes, &self.arrow_schema)?; - Ok(arrow_schema_bytes) + pub fn with_projection_pushdown( + arrow_schema: SchemaRef, + projected_fields: Vec, + ) -> ReadContext { + let target_schema = Self::project_schema(arrow_schema.clone(), projected_fields.as_slice()); + let mut sorted_fields = projected_fields.clone(); + sorted_fields.sort_unstable(); + + let project = { + if !sorted_fields.eq(&projected_fields) { + // reordering is required + // Calculate reordering indexes to transform from sorted order to user-requested order + let mut reordering_indexes = Vec::with_capacity(projected_fields.len()); + for &original_idx in &projected_fields { + let pos = sorted_fields + .binary_search(&original_idx) + .expect("projection index should exist in sorted list"); + reordering_indexes.push(pos); + } + Projection { + ordered_schema: Self::project_schema( + arrow_schema.clone(), + sorted_fields.as_slice(), + ), + projected_fields, + ordered_fields: sorted_fields, + reordering_indexes, + reordering_needed: true, + } + } else { + Projection { + ordered_schema: Self::project_schema(arrow_schema, projected_fields.as_slice()), + ordered_fields: projected_fields.clone(), + projected_fields, + reordering_indexes: vec![], + reordering_needed: false, + } + } + }; + + ReadContext { + target_schema, + projection: Some(project), + } + } + + pub fn project_schema(schema: SchemaRef, projected_fields: &[usize]) -> SchemaRef { + // todo: handle the exception + SchemaRef::new( + schema + .project(projected_fields) + .expect("can't project schema"), + ) + } + 
+ pub fn project_fields(&self) -> Option<&[usize]> { + self.projection + .as_ref() + .map(|p| p.projected_fields.as_slice()) + } + + pub fn project_fields_in_order(&self) -> Option<&[usize]> { + self.projection + .as_ref() + .map(|p| p.ordered_fields.as_slice()) + } + + pub fn record_batch(&self, data: &[u8]) -> Result> { + let (batch_metadata, body_buffer, version) = match parse_ipc_message(data) { + Some(result) => result, + None => return Ok(None), + }; + + // the record batch from server must be ordered by field pos, + // according to project to decide what arrow schema to use + // to parse the record batch + let resolve_schema = match self.projection { + Some(ref projection) => { + // projection, should use ordered schema by project field pos + projection.ordered_schema.clone() + } + None => { + // no projection, use target output schema + self.target_schema.clone() + } + }; + + let record_batch = read_record_batch( + &body_buffer, + batch_metadata, + resolve_schema, + &std::collections::HashMap::new(), + None, + &version, + )?; + + let record_batch = match &self.projection { + Some(projection) if projection.reordering_needed => { + // Reorder columns if needed (when projection pushdown with non-sorted order) + let reordered_columns: Vec<_> = projection + .reordering_indexes + .iter() + .map(|&idx| record_batch.column(idx).clone()) + .collect(); + RecordBatch::try_new(self.target_schema.clone(), reordered_columns)? 
+ } + _ => record_batch, + }; + Ok(Some(record_batch)) } } diff --git a/fluss-rust/crates/fluss/tests/integration/table.rs b/fluss-rust/crates/fluss/tests/integration/table.rs index aa02724715..e14b852648 100644 --- a/fluss-rust/crates/fluss/tests/integration/table.rs +++ b/fluss-rust/crates/fluss/tests/integration/table.rs @@ -54,6 +54,7 @@ mod table_test { }) .join() .expect("Failed to create cluster"); + // wait for 20 seconds to avoid the error like // CoordinatorEventProcessor is not initialized yet thread::sleep(std::time::Duration::from_secs(20)); @@ -84,14 +85,16 @@ mod table_test { } #[tokio::test] - async fn append_record_batch() { + async fn append_record_batch_and_scan() { let cluster = get_fluss_cluster(); let connection = cluster.get_fluss_connection().await; let admin = connection.get_admin().await.expect("Failed to get admin"); - let table_path = - TablePath::new("fluss".to_string(), "test_append_record_batch".to_string()); + let table_path = TablePath::new( + "fluss".to_string(), + "test_append_record_batch_and_scan".to_string(), + ); let table_descriptor = TableDescriptor::builder() .schema( @@ -101,15 +104,18 @@ mod table_test { .build() .expect("Failed to build schema"), ) + .property("table.log.arrow.compression.type", "NONE") .build() .expect("Failed to build table"); create_table(&admin, &table_path, &table_descriptor).await; - let append_writer = connection + let table = connection .get_table(&table_path) .await - .expect("Failed to get table") + .expect("Failed to get table"); + + let append_writer = table .new_append() .expect("Failed to create append") .create_writer(); @@ -128,6 +134,77 @@ mod table_test { .await .expect("Failed to append batch"); + append_writer.flush().await.expect("Failed to flush"); + + let num_buckets = table.table_info().get_num_buckets(); + let log_scanner = table.new_scan().create_log_scanner(); + for bucket_id in 0..num_buckets { + log_scanner + .subscribe(bucket_id, 0) + .await + .expect("Failed to subscribe"); 
+ } + + let scan_records = log_scanner + .poll(std::time::Duration::from_secs(5)) + .await + .expect("Failed to poll"); + + let mut records: Vec<_> = scan_records.into_iter().collect(); + records.sort_by_key(|r| r.offset()); + + assert_eq!(records.len(), 6, "Should have 6 records"); + for (i, record) in records.iter().enumerate() { + let row = record.row(); + let expected_c1 = (i + 1) as i32; + let expected_c2 = format!("a{}", i + 1); + assert_eq!(row.get_int(0), expected_c1, "c1 mismatch at index {}", i); + assert_eq!(row.get_string(1), expected_c2, "c2 mismatch at index {}", i); + } + + let log_scanner_projected = table + .new_scan() + .project(&[1, 0]) + .expect("Failed to project") + .create_log_scanner(); + for bucket_id in 0..num_buckets { + log_scanner_projected + .subscribe(bucket_id, 0) + .await + .expect("Failed to subscribe"); + } + + let scan_records_projected = log_scanner_projected + .poll(std::time::Duration::from_secs(5)) + .await + .expect("Failed to poll"); + + let mut records_projected: Vec<_> = scan_records_projected.into_iter().collect(); + records_projected.sort_by_key(|r| r.offset()); + + assert_eq!( + records_projected.len(), + 6, + "Should have 6 records with projection" + ); + for (i, record) in records_projected.iter().enumerate() { + let row = record.row(); + let expected_c2 = format!("a{}", i + 1); + let expected_c1 = (i + 1) as i32; + assert_eq!( + row.get_string(0), + expected_c2, + "Projected c2 (first column) mismatch at index {}", + i + ); + assert_eq!( + row.get_int(1), + expected_c1, + "Projected c1 (second column) mismatch at index {}", + i + ); + } + // Create scanner to verify appended records let table = connection .get_table(&table_path) From de9022e7030323e81c10ea6ed4844010dbaa36e1 Mon Sep 17 00:00:00 2001 From: Pavlos-Petros Tournaris Date: Fri, 5 Dec 2025 06:26:24 +0200 Subject: [PATCH 025/287] feat: Implement JsonSerde for all datatypes (#53) --- .../crates/fluss/src/metadata/json_serde.rs | 285 +++++++++++++++--- 1 file 
changed, 251 insertions(+), 34 deletions(-) diff --git a/fluss-rust/crates/fluss/src/metadata/json_serde.rs b/fluss-rust/crates/fluss/src/metadata/json_serde.rs index 1c7604c98a..447b0f9ff3 100644 --- a/fluss-rust/crates/fluss/src/metadata/json_serde.rs +++ b/fluss-rust/crates/fluss/src/metadata/json_serde.rs @@ -17,7 +17,7 @@ use crate::error::Error::{InvalidTableError, JsonSerdeError}; use crate::error::Result; -use crate::metadata::datatype::{DataType, DataTypes}; +use crate::metadata::datatype::{DataField, DataType, DataTypes}; use crate::metadata::table::{Column, Schema, TableDescriptor}; use serde_json::{Value, json}; use std::collections::HashMap; @@ -58,10 +58,8 @@ impl DataType { const FIELD_NAME_TYPE_NAME: &'static str = "type"; const FIELD_NAME_NULLABLE: &'static str = "nullable"; const FIELD_NAME_LENGTH: &'static str = "length"; - #[allow(dead_code)] const FIELD_NAME_PRECISION: &'static str = "precision"; - #[allow(dead_code)] - const FILED_NAME_SCALE: &'static str = "scale"; + const FIELD_NAME_SCALE: &'static str = "scale"; #[allow(dead_code)] const FIELD_NAME_ELEMENT_TYPE: &'static str = "element_type"; #[allow(dead_code)] @@ -111,21 +109,54 @@ impl JsonSerde for DataType { obj.insert(Self::FIELD_NAME_LENGTH.to_string(), json!(_type.length())); } DataType::Decimal(_type) => { - todo!() + obj.insert( + Self::FIELD_NAME_PRECISION.to_string(), + json!(_type.precision()), + ); + obj.insert(Self::FIELD_NAME_SCALE.to_string(), json!(_type.scale())); } - DataType::Time(_type) => { - todo!() + obj.insert( + Self::FIELD_NAME_PRECISION.to_string(), + json!(_type.precision()), + ); } DataType::Timestamp(_type) => { - todo!() + obj.insert( + Self::FIELD_NAME_PRECISION.to_string(), + json!(_type.precision()), + ); } DataType::TimestampLTz(_type) => { - todo!() + obj.insert( + Self::FIELD_NAME_PRECISION.to_string(), + json!(_type.precision()), + ); + } + DataType::Array(_type) => { + obj.insert( + Self::FIELD_NAME_ELEMENT_TYPE.to_string(), + 
_type.get_element_type().serialize_json()?, + ); + } + DataType::Map(_type) => { + obj.insert( + Self::FIELD_NAME_KEY_TYPE.to_string(), + _type.key_type().serialize_json()?, + ); + obj.insert( + Self::FIELD_NAME_VALUE_TYPE.to_string(), + _type.value_type().serialize_json()?, + ); + } + DataType::Row(_type) => { + let fields: Vec = _type + .fields() + .iter() + .map(|field| field.serialize_json()) + .collect::>()?; + obj.insert(Self::FIELD_NAME_FIELDS.to_string(), json!(fields)); } - DataType::Array(_type) => todo!(), - DataType::Map(_type) => todo!(), - DataType::Row(_type) => todo!(), } Ok(Value::Object(obj)) } @@ -150,18 +181,112 @@ impl JsonSerde for DataType { "BIGINT" => DataTypes::bigint(), "FLOAT" => DataTypes::float(), "DOUBLE" => DataTypes::double(), - "CHAR" => todo!(), + "CHAR" => { + let length = node + .get(Self::FIELD_NAME_LENGTH) + .and_then(|v| v.as_u64()) + .ok_or_else(|| { + JsonSerdeError(format!( + "Missing required field: {}", + Self::FIELD_NAME_LENGTH + )) + })? as u32; + DataTypes::char(length) + } "STRING" => DataTypes::string(), - "DECIMAL" => todo!(), + "DECIMAL" => { + let precision = node + .get(Self::FIELD_NAME_PRECISION) + .and_then(|v| v.as_u64()) + .ok_or_else(|| { + JsonSerdeError(format!( + "Missing required field: {}", + Self::FIELD_NAME_PRECISION + )) + })? 
as u32; + let scale = node + .get(Self::FIELD_NAME_SCALE) + .and_then(|v| v.as_u64()) + .unwrap_or(0) as u32; + DataTypes::decimal(precision, scale) + } "DATE" => DataTypes::date(), - "TIME_WITHOUT_TIME_ZONE" => todo!(), // Precision set separately - "TIMESTAMP_WITHOUT_TIME_ZONE" => todo!(), // Precision set separately - "TIMESTAMP_WITH_LOCAL_TIME_ZONE" => todo!(), // Precision set separately + "TIME_WITHOUT_TIME_ZONE" => { + let precision = node + .get(Self::FIELD_NAME_PRECISION) + .and_then(|v| v.as_u64()) + .unwrap_or(0) as u32; + DataTypes::time_with_precision(precision) + } + "TIMESTAMP_WITHOUT_TIME_ZONE" => { + let precision = node + .get(Self::FIELD_NAME_PRECISION) + .and_then(|v| v.as_u64()) + .unwrap_or(6) as u32; + DataTypes::timestamp_with_precision(precision) + } + "TIMESTAMP_WITH_LOCAL_TIME_ZONE" => { + let precision = node + .get(Self::FIELD_NAME_PRECISION) + .and_then(|v| v.as_u64()) + .unwrap_or(6) as u32; + DataTypes::timestamp_ltz_with_precision(precision) + } "BYTES" => DataTypes::bytes(), - "BINARY" => todo!(), - "ARRAY" => todo!(), - "MAP" => todo!(), - "ROW" => todo!(), + "BINARY" => { + let length = node + .get(Self::FIELD_NAME_LENGTH) + .and_then(|v| v.as_u64()) + .unwrap_or(1) as usize; + DataTypes::binary(length) + } + "ARRAY" => { + let element_type_node = + node.get(Self::FIELD_NAME_ELEMENT_TYPE).ok_or_else(|| { + JsonSerdeError(format!( + "Missing required field: {}", + Self::FIELD_NAME_ELEMENT_TYPE + )) + })?; + let element_type = DataType::deserialize_json(element_type_node)?; + DataTypes::array(element_type) + } + "MAP" => { + let key_type_node = node.get(Self::FIELD_NAME_KEY_TYPE).ok_or_else(|| { + JsonSerdeError(format!( + "Missing required field: {}", + Self::FIELD_NAME_KEY_TYPE + )) + })?; + let key_type = DataType::deserialize_json(key_type_node)?; + let value_type_node = node.get(Self::FIELD_NAME_VALUE_TYPE).ok_or_else(|| { + JsonSerdeError(format!( + "Missing required field: {}", + Self::FIELD_NAME_VALUE_TYPE + )) + })?; + let 
value_type = DataType::deserialize_json(value_type_node)?; + DataTypes::map(key_type, value_type) + } + "ROW" => { + let fields_node = node + .get(Self::FIELD_NAME_FIELDS) + .ok_or_else(|| { + JsonSerdeError(format!( + "Missing required field: {}", + Self::FIELD_NAME_FIELDS + )) + })? + .as_array() + .ok_or_else(|| { + JsonSerdeError(format!("{} must be an array", Self::FIELD_NAME_FIELDS)) + })?; + let mut fields = Vec::with_capacity(fields_node.len()); + for field_node in fields_node { + fields.push(DataField::deserialize_json(field_node)?); + } + DataTypes::row(fields) + } _ => return Err(JsonSerdeError(format!("Unknown type root: {type_root}"))), }; @@ -175,6 +300,51 @@ impl JsonSerde for DataType { } } +impl DataField { + const NAME: &'static str = "name"; + const FIELD_TYPE: &'static str = "field_type"; + const DESCRIPTION: &'static str = "description"; +} + +impl JsonSerde for DataField { + fn serialize_json(&self) -> Result { + let mut obj = serde_json::Map::new(); + + obj.insert(Self::NAME.to_string(), json!(self.name())); + obj.insert( + Self::FIELD_TYPE.to_string(), + self.data_type.serialize_json()?, + ); + + if let Some(description) = &self.description { + obj.insert(Self::DESCRIPTION.to_string(), json!(description)); + } + + Ok(Value::Object(obj)) + } + + fn deserialize_json(node: &Value) -> Result { + let name = node + .get(Self::NAME) + .and_then(|v| v.as_str()) + .ok_or_else(|| JsonSerdeError(format!("Missing required field: {}", Self::NAME)))? 
+ .to_string(); + + let field_type_node = node.get(Self::FIELD_TYPE).ok_or_else(|| { + JsonSerdeError(format!("Missing required field: {}", Self::FIELD_TYPE)) + })?; + + let data_type = DataType::deserialize_json(field_type_node)?; + + let description = node + .get(Self::DESCRIPTION) + .and_then(|v| v.as_str()) + .map(|s| s.to_string()); + + Ok(DataField::new(name, data_type, description)) + } +} + impl Column { const NAME: &'static str = "name"; const DATA_TYPE: &'static str = "data_type"; @@ -203,7 +373,7 @@ impl JsonSerde for Column { let name = node .get(Self::NAME) .and_then(|v| v.as_str()) - .unwrap_or_else(|| panic!("{}", format!("Missing required field: {}", Self::NAME))) + .ok_or_else(|| JsonSerdeError(format!("Missing required field: {}", Self::NAME)))? .to_string(); let data_type_node = node.get(Self::DATA_TYPE).ok_or_else(|| { @@ -263,7 +433,7 @@ impl JsonSerde for Schema { JsonSerdeError(format!("Missing required field: {}", Self::COLUMNS_NAME)) })? .as_array() - .ok_or_else(|| JsonSerdeError(format!("{} should be an array", Self::COLUMNS_NAME)))?; + .ok_or_else(|| JsonSerdeError(format!("{} must be an array", Self::COLUMNS_NAME)))?; let mut columns = Vec::with_capacity(columns_node.len()); for col_node in columns_node { @@ -275,14 +445,16 @@ impl JsonSerde for Schema { if let Some(pk_node) = node.get(Self::PRIMARY_KEY_NAME) { let pk_array = pk_node .as_array() - .ok_or_else(|| InvalidTableError("Primary key is not an array".to_string()))?; + .ok_or_else(|| InvalidTableError("Primary key must be an array".to_string()))?; let mut primary_keys = Vec::with_capacity(pk_array.len()); for name_node in pk_array { primary_keys.push( name_node .as_str() - .ok_or_else(|| InvalidTableError("Primary key is not string".to_string()))? + .ok_or_else(|| { + InvalidTableError("Primary key element must be a string".to_string()) + })? 
.to_string(), ); } @@ -308,7 +480,7 @@ impl TableDescriptor { fn deserialize_properties(node: &Value) -> Result> { let obj = node .as_object() - .ok_or_else(|| JsonSerdeError("Properties should be an object".to_string()))?; + .ok_or_else(|| JsonSerdeError("Properties must be an object".to_string()))?; let mut properties = HashMap::with_capacity(obj.len()); for (key, value) in obj { @@ -316,7 +488,7 @@ impl TableDescriptor { key.clone(), value .as_str() - .ok_or_else(|| JsonSerdeError("Properties should be an object".to_string()))? + .ok_or_else(|| JsonSerdeError("Property value must be a string".to_string()))? .to_owned(), ); } @@ -383,9 +555,7 @@ impl JsonSerde for TableDescriptor { if let Some(comment_node) = node.get(Self::COMMENT_NAME) { let comment = comment_node .as_str() - .ok_or_else(|| { - JsonSerdeError(format!("{} should be a string", Self::COMMENT_NAME)) - })? + .ok_or_else(|| JsonSerdeError(format!("{} must be a string", Self::COMMENT_NAME)))? .to_owned(); builder = builder.comment(comment.as_str()); } @@ -400,7 +570,7 @@ impl JsonSerde for TableDescriptor { })? .as_array() .ok_or_else(|| { - JsonSerdeError(format!("{} should be an array", Self::PARTITION_KEY_NAME)) + JsonSerdeError(format!("{} must be an array", Self::PARTITION_KEY_NAME)) })?; let mut partition_keys = Vec::with_capacity(partition_node.len()); @@ -409,7 +579,10 @@ impl JsonSerde for TableDescriptor { key_node .as_str() .ok_or_else(|| { - JsonSerdeError(format!("{} should be a string", Self::PARTITION_KEY_NAME)) + JsonSerdeError(format!( + "{} element must be a string", + Self::PARTITION_KEY_NAME + )) })? 
.to_owned(), ); @@ -420,14 +593,14 @@ impl JsonSerde for TableDescriptor { let mut bucket_keys = vec![]; if let Some(bucket_key_node) = node.get(Self::BUCKET_KEY_NAME) { let bucket_key_node = bucket_key_node.as_array().ok_or_else(|| { - JsonSerdeError(format!("{} should be an array", Self::BUCKET_COUNT_NAME)) + JsonSerdeError(format!("{} must be an array", Self::BUCKET_KEY_NAME)) })?; for key_node in bucket_key_node { bucket_keys.push( key_node .as_str() - .ok_or_else(|| JsonSerdeError("Bucket key should be a string".to_string()))? + .ok_or_else(|| JsonSerdeError("Bucket key must be a string".to_string()))? .to_owned(), ); } @@ -462,3 +635,47 @@ impl JsonSerde for TableDescriptor { builder.build() } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::DataTypes; + + #[test] + fn test_datatype_json_serde() { + let data_types = vec![ + DataTypes::boolean(), + DataTypes::tinyint(), + DataTypes::smallint(), + DataTypes::int().as_non_nullable(), + DataTypes::bigint(), + DataTypes::float(), + DataTypes::double(), + DataTypes::char(10), + DataTypes::string(), + DataTypes::decimal(10, 2), + DataTypes::date(), + DataTypes::time(), + DataTypes::timestamp(), + DataTypes::timestamp_ltz(), + DataTypes::bytes(), + DataTypes::binary(100), + DataTypes::array(DataTypes::int()), + DataTypes::map(DataTypes::string(), DataTypes::int()), + DataTypes::row(vec![ + DataField::new("f1".to_string(), DataTypes::int(), None), + DataField::new( + "f2".to_string(), + DataTypes::string(), + Some("desc".to_string()), + ), + ]), + ]; + + for dt in data_types { + let json = dt.serialize_json().unwrap(); + let deserialized = DataType::deserialize_json(&json).unwrap(); + assert_eq!(dt, deserialized); + } + } +} From f98f49fcf00f2507be6148f24a239b066a2abd3f Mon Sep 17 00:00:00 2001 From: Karan Pradhan <78605930+KaranPradhan266@users.noreply.github.com> Date: Thu, 11 Dec 2025 06:29:30 -0800 Subject: [PATCH 026/287] chore: implemented all get methods for GenericRow (#82) --- 
fluss-rust/crates/fluss/src/row/datum.rs | 92 ++++++++++++++++++++++++ fluss-rust/crates/fluss/src/row/mod.rs | 40 ++++++----- 2 files changed, 116 insertions(+), 16 deletions(-) diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs index ed33b8badf..3e487039b3 100644 --- a/fluss-rust/crates/fluss/src/row/datum.rs +++ b/fluss-rust/crates/fluss/src/row/datum.rs @@ -44,6 +44,8 @@ pub enum Datum<'a> { #[display("{0}")] Bool(bool), #[display("{0}")] + Int8(i8), + #[display("{0}")] Int16(i16), #[display("{0}")] Int32(i32), @@ -78,6 +80,13 @@ impl Datum<'_> { _ => panic!("not a string: {self:?}"), } } + + pub fn as_blob(&self) -> &[u8] { + match self { + Self::Blob(blob) => blob.as_ref(), + _ => panic!("not a blob: {self:?}"), + } + } } // ----------- implement from @@ -95,6 +104,20 @@ impl<'a> From for Datum<'a> { } } +impl<'a> From for Datum<'a> { + #[inline] + fn from(i: i8) -> Datum<'a> { + Datum::Int8(i) + } +} + +impl<'a> From for Datum<'a> { + #[inline] + fn from(i: i16) -> Datum<'a> { + Datum::Int16(i) + } +} + impl<'a> From<&'a str> for Datum<'a> { #[inline] fn from(s: &'a str) -> Datum<'a> { @@ -134,6 +157,18 @@ impl TryFrom<&Datum<'_>> for i32 { } } +impl TryFrom<&Datum<'_>> for i16 { + type Error = (); + + #[inline] + fn try_from(from: &Datum) -> std::result::Result { + match from { + Datum::Int16(i) => Ok(*i), + _ => Err(()), + } + } +} + impl TryFrom<&Datum<'_>> for i64 { type Error = (); @@ -146,6 +181,42 @@ impl TryFrom<&Datum<'_>> for i64 { } } +impl TryFrom<&Datum<'_>> for f32 { + type Error = (); + + #[inline] + fn try_from(from: &Datum) -> std::result::Result { + match from { + Datum::Float32(f) => Ok(f.into_inner()), + _ => Err(()), + } + } +} + +impl TryFrom<&Datum<'_>> for f64 { + type Error = (); + + #[inline] + fn try_from(from: &Datum) -> std::result::Result { + match from { + Datum::Float64(f) => Ok(f.into_inner()), + _ => Err(()), + } + } +} + +impl TryFrom<&Datum<'_>> for bool { + type Error = (); + 
+ #[inline] + fn try_from(from: &Datum) -> std::result::Result { + match from { + Datum::Bool(b) => Ok(*b), + _ => Err(()), + } + } +} + impl<'a> TryFrom<&Datum<'a>> for &'a str { type Error = (); @@ -158,6 +229,25 @@ impl<'a> TryFrom<&Datum<'a>> for &'a str { } } +impl TryFrom<&Datum<'_>> for i8 { + type Error = (); + + #[inline] + fn try_from(from: &Datum) -> std::result::Result { + match from { + Datum::Int8(i) => Ok(*i), + _ => Err(()), + } + } +} + +impl<'a> From for Datum<'a> { + #[inline] + fn from(b: bool) -> Datum<'a> { + Datum::Bool(b) + } +} + pub trait ToArrow { fn append_to(&self, builder: &mut dyn ArrayBuilder) -> Result<()>; } @@ -184,6 +274,7 @@ impl Datum<'_> { match self { Datum::Null => { + append_null_to_arrow!(Int8Builder); append_null_to_arrow!(BooleanBuilder); append_null_to_arrow!(Int16Builder); append_null_to_arrow!(Int32Builder); @@ -194,6 +285,7 @@ impl Datum<'_> { append_null_to_arrow!(BinaryBuilder); } Datum::Bool(v) => append_value_to_arrow!(BooleanBuilder, *v), + Datum::Int8(v) => append_value_to_arrow!(Int8Builder, *v), Datum::Int16(v) => append_value_to_arrow!(Int16Builder, *v), Datum::Int32(v) => append_value_to_arrow!(Int32Builder, *v), Datum::Int64(v) => append_value_to_arrow!(Int64Builder, *v), diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index aa2c41159e..a3b8885254 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -85,16 +85,16 @@ impl<'a> InternalRow for GenericRow<'a> { false } - fn get_boolean(&self, _pos: usize) -> bool { - todo!() + fn get_boolean(&self, pos: usize) -> bool { + self.values.get(pos).unwrap().try_into().unwrap() } - fn get_byte(&self, _pos: usize) -> i8 { - todo!() + fn get_byte(&self, pos: usize) -> i8 { + self.values.get(pos).unwrap().try_into().unwrap() } - fn get_short(&self, _pos: usize) -> i16 { - todo!() + fn get_short(&self, pos: usize) -> i16 { + self.values.get(pos).unwrap().try_into().unwrap() } fn 
get_int(&self, pos: usize) -> i32 { @@ -105,28 +105,36 @@ impl<'a> InternalRow for GenericRow<'a> { self.values.get(_pos).unwrap().try_into().unwrap() } - fn get_float(&self, _pos: usize) -> f32 { - todo!() + fn get_float(&self, pos: usize) -> f32 { + self.values.get(pos).unwrap().try_into().unwrap() } - fn get_double(&self, _pos: usize) -> f64 { - todo!() + fn get_double(&self, pos: usize) -> f64 { + self.values.get(pos).unwrap().try_into().unwrap() } - fn get_char(&self, _pos: usize, _length: usize) -> String { - todo!() + fn get_char(&self, pos: usize, length: usize) -> String { + let value = self.get_string(pos); + if value.len() != length { + panic!( + "Length mismatch for fixed-size char: expected {}, got {}", + length, + value.len() + ); + } + value.to_string() } fn get_string(&self, pos: usize) -> &str { self.values.get(pos).unwrap().try_into().unwrap() } - fn get_binary(&self, _pos: usize, _length: usize) -> Vec { - todo!() + fn get_binary(&self, pos: usize, _length: usize) -> Vec { + self.values.get(pos).unwrap().as_blob().to_vec() } - fn get_bytes(&self, _pos: usize) -> Vec { - todo!() + fn get_bytes(&self, pos: usize) -> Vec { + self.values.get(pos).unwrap().as_blob().to_vec() } } From 1239980006b0155c44179b278fa1432ede2384e7 Mon Sep 17 00:00:00 2001 From: Chase Naples Date: Thu, 11 Dec 2025 20:34:59 -0500 Subject: [PATCH 027/287] chore: fix GenericRow null detection (#86) --- fluss-rust/crates/fluss/src/row/mod.rs | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index a3b8885254..909f3b136f 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -81,8 +81,11 @@ impl<'a> InternalRow for GenericRow<'a> { self.values.len() } - fn is_null_at(&self, _pos: usize) -> bool { - false + fn is_null_at(&self, pos: usize) -> bool { + self.values + .get(pos) + .expect("position out of bounds") + 
.is_null() } fn get_boolean(&self, pos: usize) -> bool { @@ -153,3 +156,18 @@ impl<'a> GenericRow<'a> { self.values.insert(pos, value.into()); } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn is_null_at_checks_datum_nullity() { + let mut row = GenericRow::new(); + row.set_field(0, Datum::Null); + row.set_field(1, 42_i32); + + assert!(row.is_null_at(0)); + assert!(!row.is_null_at(1)); + } +} From 5ed475e9ffd6da7756115d317f210c1a83990375 Mon Sep 17 00:00:00 2001 From: Junbo Wang Date: Fri, 12 Dec 2025 19:58:59 +0800 Subject: [PATCH 028/287] chore: Implement datatype conversion for all types in arrow.rs (#81) --- .../crates/fluss/src/metadata/datatype.rs | 40 ++-- fluss-rust/crates/fluss/src/record/arrow.rs | 212 +++++++++++++++++- 2 files changed, 223 insertions(+), 29 deletions(-) diff --git a/fluss-rust/crates/fluss/src/metadata/datatype.rs b/fluss-rust/crates/fluss/src/metadata/datatype.rs index 4deed2bce0..8ad4f7e569 100644 --- a/fluss-rust/crates/fluss/src/metadata/datatype.rs +++ b/fluss-rust/crates/fluss/src/metadata/datatype.rs @@ -96,25 +96,25 @@ impl DataType { impl Display for DataType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { - DataType::Boolean(v) => write!(f, "{}", v), - DataType::TinyInt(v) => write!(f, "{}", v), - DataType::SmallInt(v) => write!(f, "{}", v), - DataType::Int(v) => write!(f, "{}", v), - DataType::BigInt(v) => write!(f, "{}", v), - DataType::Float(v) => write!(f, "{}", v), - DataType::Double(v) => write!(f, "{}", v), - DataType::Char(v) => write!(f, "{}", v), - DataType::String(v) => write!(f, "{}", v), - DataType::Decimal(v) => write!(f, "{}", v), - DataType::Date(v) => write!(f, "{}", v), - DataType::Time(v) => write!(f, "{}", v), - DataType::Timestamp(v) => write!(f, "{}", v), - DataType::TimestampLTz(v) => write!(f, "{}", v), - DataType::Bytes(v) => write!(f, "{}", v), - DataType::Binary(v) => write!(f, "{}", v), - DataType::Array(v) => write!(f, "{}", v), - DataType::Map(v) => 
write!(f, "{}", v), - DataType::Row(v) => write!(f, "{}", v), + DataType::Boolean(v) => write!(f, "{v}"), + DataType::TinyInt(v) => write!(f, "{v}"), + DataType::SmallInt(v) => write!(f, "{v}"), + DataType::Int(v) => write!(f, "{v}"), + DataType::BigInt(v) => write!(f, "{v}"), + DataType::Float(v) => write!(f, "{v}"), + DataType::Double(v) => write!(f, "{v}"), + DataType::Char(v) => write!(f, "{v}"), + DataType::String(v) => write!(f, "{v}"), + DataType::Decimal(v) => write!(f, "{v}"), + DataType::Date(v) => write!(f, "{v}"), + DataType::Time(v) => write!(f, "{v}"), + DataType::Timestamp(v) => write!(f, "{v}"), + DataType::TimestampLTz(v) => write!(f, "{v}"), + DataType::Bytes(v) => write!(f, "{v}"), + DataType::Binary(v) => write!(f, "{v}"), + DataType::Array(v) => write!(f, "{v}"), + DataType::Map(v) => write!(f, "{v}"), + DataType::Row(v) => write!(f, "{v}"), } } } @@ -861,7 +861,7 @@ impl Display for RowType { if i > 0 { write!(f, ", ")?; } - write!(f, "{}", field)?; + write!(f, "{field}")?; } write!(f, ">")?; if !self.nullable { diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index 29bfe41a21..e46093dd14 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -589,16 +589,84 @@ pub fn to_arrow_type(fluss_type: &DataType) -> ArrowDataType { DataType::Double(_) => ArrowDataType::Float64, DataType::Char(_) => ArrowDataType::Utf8, DataType::String(_) => ArrowDataType::Utf8, - DataType::Decimal(_) => todo!(), + DataType::Decimal(decimal_type) => ArrowDataType::Decimal128( + decimal_type + .precision() + .try_into() + .expect("precision exceeds u8::MAX"), + decimal_type + .scale() + .try_into() + .expect("scale exceeds i8::MAX"), + ), DataType::Date(_) => ArrowDataType::Date32, - DataType::Time(_) => todo!(), - DataType::Timestamp(_) => todo!(), - DataType::TimestampLTz(_) => todo!(), - DataType::Bytes(_) => todo!(), - DataType::Binary(_) => todo!(), - 
DataType::Array(_data_type) => todo!(), - DataType::Map(_data_type) => todo!(), - DataType::Row(_data_fields) => todo!(), + DataType::Time(time_type) => match time_type.precision() { + 0 => ArrowDataType::Time32(arrow_schema::TimeUnit::Second), + 1..=3 => ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond), + 4..=6 => ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond), + 7..=9 => ArrowDataType::Time64(arrow_schema::TimeUnit::Nanosecond), + // This arm should never be reached due to validation in TimeType. + invalid => panic!("Invalid precision value for TimeType: {invalid}"), + }, + DataType::Timestamp(timestamp_type) => match timestamp_type.precision() { + 0 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, None), + 1..=3 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None), + 4..=6 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None), + 7..=9 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None), + // This arm should never be reached due to validation in Timestamp. + invalid => panic!("Invalid precision value for TimestampType: {invalid}"), + }, + DataType::TimestampLTz(timestamp_ltz_type) => match timestamp_ltz_type.precision() { + 0 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, None), + 1..=3 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None), + 4..=6 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None), + 7..=9 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None), + // This arm should never be reached due to validation in TimestampLTz. 
+ invalid => panic!("Invalid precision value for TimestampLTzType: {invalid}"), + }, + DataType::Bytes(_) => ArrowDataType::Binary, + DataType::Binary(binary_type) => ArrowDataType::FixedSizeBinary( + binary_type + .length() + .try_into() + .expect("length exceeds i32::MAX"), + ), + DataType::Array(array_type) => ArrowDataType::List( + Field::new_list_field( + to_arrow_type(array_type.get_element_type()), + fluss_type.is_nullable(), + ) + .into(), + ), + DataType::Map(map_type) => { + let key_type = to_arrow_type(map_type.key_type()); + let value_type = to_arrow_type(map_type.value_type()); + let entry_fields = vec![ + Field::new("key", key_type, map_type.key_type().is_nullable()), + Field::new("value", value_type, map_type.value_type().is_nullable()), + ]; + ArrowDataType::Map( + Arc::new(Field::new( + "entries", + ArrowDataType::Struct(arrow_schema::Fields::from(entry_fields)), + fluss_type.is_nullable(), + )), + false, + ) + } + DataType::Row(row_type) => ArrowDataType::Struct(arrow_schema::Fields::from( + row_type + .fields() + .iter() + .map(|f| { + Field::new( + f.name(), + to_arrow_type(f.data_type()), + f.data_type().is_nullable(), + ) + }) + .collect::>(), + )), } } @@ -820,3 +888,129 @@ impl ArrowReader { } } pub struct MyVec(pub StreamReader); + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::DataTypes; + + #[test] + fn test_to_array_type() { + assert_eq!(to_arrow_type(&DataTypes::boolean()), ArrowDataType::Boolean); + assert_eq!(to_arrow_type(&DataTypes::tinyint()), ArrowDataType::Int8); + assert_eq!(to_arrow_type(&DataTypes::smallint()), ArrowDataType::Int16); + assert_eq!(to_arrow_type(&DataTypes::bigint()), ArrowDataType::Int64); + assert_eq!(to_arrow_type(&DataTypes::int()), ArrowDataType::Int32); + assert_eq!(to_arrow_type(&DataTypes::float()), ArrowDataType::Float32); + assert_eq!(to_arrow_type(&DataTypes::double()), ArrowDataType::Float64); + assert_eq!(to_arrow_type(&DataTypes::char(16)), ArrowDataType::Utf8); + 
assert_eq!(to_arrow_type(&DataTypes::string()), ArrowDataType::Utf8); + assert_eq!( + to_arrow_type(&DataTypes::decimal(10, 2)), + ArrowDataType::Decimal128(10, 2) + ); + assert_eq!(to_arrow_type(&DataTypes::date()), ArrowDataType::Date32); + assert_eq!( + to_arrow_type(&DataTypes::time()), + ArrowDataType::Time32(arrow_schema::TimeUnit::Second) + ); + assert_eq!( + to_arrow_type(&DataTypes::time_with_precision(3)), + ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond) + ); + assert_eq!( + to_arrow_type(&DataTypes::time_with_precision(6)), + ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond) + ); + assert_eq!( + to_arrow_type(&DataTypes::time_with_precision(9)), + ArrowDataType::Time64(arrow_schema::TimeUnit::Nanosecond) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_with_precision(0)), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_with_precision(3)), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_with_precision(6)), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_with_precision(9)), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(0)), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(3)), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(6)), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(9)), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None) + ); + assert_eq!(to_arrow_type(&DataTypes::bytes()), 
ArrowDataType::Binary); + assert_eq!( + to_arrow_type(&DataTypes::binary(16)), + ArrowDataType::FixedSizeBinary(16) + ); + + assert_eq!( + to_arrow_type(&DataTypes::array(DataTypes::int())), + ArrowDataType::List(Field::new_list_field(ArrowDataType::Int32, true).into()) + ); + + assert_eq!( + to_arrow_type(&DataTypes::map(DataTypes::string(), DataTypes::int())), + ArrowDataType::Map( + Arc::new(Field::new( + "entries", + ArrowDataType::Struct(arrow_schema::Fields::from(vec![ + Field::new("key", ArrowDataType::Utf8, true), + Field::new("value", ArrowDataType::Int32, true), + ])), + true, + )), + false, + ) + ); + + assert_eq!( + to_arrow_type(&DataTypes::row(vec![ + DataTypes::field("f1".to_string(), DataTypes::int()), + DataTypes::field("f2".to_string(), DataTypes::string()), + ])), + ArrowDataType::Struct(arrow_schema::Fields::from(vec![ + Field::new("f1", ArrowDataType::Int32, true), + Field::new("f2", ArrowDataType::Utf8, true), + ])) + ); + } + + #[test] + #[should_panic(expected = "Invalid precision value for TimeType: 10")] + fn test_time_invalid_precision() { + to_arrow_type(&DataTypes::time_with_precision(10)); + } + + #[test] + #[should_panic(expected = "Invalid precision value for TimestampType: 10")] + fn test_timestamp_invalid_precision() { + to_arrow_type(&DataTypes::timestamp_with_precision(10)); + } + + #[test] + #[should_panic(expected = "Invalid precision value for TimestampLTzType: 10")] + fn test_timestamp_ltz_invalid_precision() { + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(10)); + } +} From c3f110436b0be4dc107981571cef45c65425dd57 Mon Sep 17 00:00:00 2001 From: AlexZhao Date: Sat, 13 Dec 2025 15:14:05 +0800 Subject: [PATCH 029/287] feat: introduce cpp bindings (#83) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --------- Co-authored-by: 赵海源 --- fluss-rust/Cargo.toml | 2 +- fluss-rust/bindings/cpp/.clang-format | 21 + fluss-rust/bindings/cpp/.gitignore | 7 + 
fluss-rust/bindings/cpp/CMakeLists.txt | 107 ++++ fluss-rust/bindings/cpp/Cargo.toml | 36 ++ fluss-rust/bindings/cpp/build.rs | 24 + fluss-rust/bindings/cpp/examples/example.cpp | 166 ++++++ fluss-rust/bindings/cpp/include/fluss.hpp | 461 +++++++++++++++ fluss-rust/bindings/cpp/src/admin.cpp | 101 ++++ fluss-rust/bindings/cpp/src/connection.cpp | 95 ++++ fluss-rust/bindings/cpp/src/ffi_converter.hpp | 256 +++++++++ fluss-rust/bindings/cpp/src/lib.rs | 523 ++++++++++++++++++ fluss-rust/bindings/cpp/src/table.cpp | 228 ++++++++ fluss-rust/bindings/cpp/src/types.rs | 485 ++++++++++++++++ fluss-rust/crates/fluss/src/config.rs | 14 +- 15 files changed, 2524 insertions(+), 2 deletions(-) create mode 100644 fluss-rust/bindings/cpp/.clang-format create mode 100644 fluss-rust/bindings/cpp/.gitignore create mode 100644 fluss-rust/bindings/cpp/CMakeLists.txt create mode 100644 fluss-rust/bindings/cpp/Cargo.toml create mode 100644 fluss-rust/bindings/cpp/build.rs create mode 100644 fluss-rust/bindings/cpp/examples/example.cpp create mode 100644 fluss-rust/bindings/cpp/include/fluss.hpp create mode 100644 fluss-rust/bindings/cpp/src/admin.cpp create mode 100644 fluss-rust/bindings/cpp/src/connection.cpp create mode 100644 fluss-rust/bindings/cpp/src/ffi_converter.hpp create mode 100644 fluss-rust/bindings/cpp/src/lib.rs create mode 100644 fluss-rust/bindings/cpp/src/table.cpp create mode 100644 fluss-rust/bindings/cpp/src/types.rs diff --git a/fluss-rust/Cargo.toml b/fluss-rust/Cargo.toml index 15bcb796fd..b4ac03b7be 100644 --- a/fluss-rust/Cargo.toml +++ b/fluss-rust/Cargo.toml @@ -28,7 +28,7 @@ rust-version = "1.85" [workspace] resolver = "2" -members = ["crates/fluss", "crates/examples", "bindings/python"] +members = ["crates/fluss", "crates/examples", "bindings/python", "bindings/cpp"] [workspace.dependencies] fluss = { version = "0.1.0", path = "./crates/fluss" } diff --git a/fluss-rust/bindings/cpp/.clang-format b/fluss-rust/bindings/cpp/.clang-format new file mode 100644 
index 0000000000..1c31900ec4 --- /dev/null +++ b/fluss-rust/bindings/cpp/.clang-format @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +--- +BasedOnStyle: Google +ColumnLimit: 100 +IndentWidth: 4 diff --git a/fluss-rust/bindings/cpp/.gitignore b/fluss-rust/bindings/cpp/.gitignore new file mode 100644 index 0000000000..6836e70c06 --- /dev/null +++ b/fluss-rust/bindings/cpp/.gitignore @@ -0,0 +1,7 @@ +build/ +cmake-build-*/ +.idea/ +*.o +*.a +*.so +*.dylib diff --git a/fluss-rust/bindings/cpp/CMakeLists.txt b/fluss-rust/bindings/cpp/CMakeLists.txt new file mode 100644 index 0000000000..629f3f076d --- /dev/null +++ b/fluss-rust/bindings/cpp/CMakeLists.txt @@ -0,0 +1,107 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +cmake_minimum_required(VERSION 3.22) + +if (POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +project(fluss-cpp LANGUAGES CXX) + +include(FetchContent) +set(FLUSS_GOOGLETEST_VERSION 1.15.2 CACHE STRING "version of GoogleTest") +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +find_package(Threads REQUIRED) + +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Debug) +endif() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +option(FLUSS_ENABLE_ADDRESS_SANITIZER "Enable address sanitizer" OFF) +option(FLUSS_ENABLE_TESTING "Enable building test binary for fluss" OFF) +option(FLUSS_DEV "Enable dev mode" OFF) + +if (FLUSS_DEV) + set(FLUSS_ENABLE_ADDRESS_SANITIZER ON) + set(FLUSS_ENABLE_TESTING ON) +endif() + +# Get cargo target dir +execute_process(COMMAND cargo locate-project --workspace --message-format plain + OUTPUT_VARIABLE CARGO_TARGET_DIR + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) +string(REGEX REPLACE "/Cargo.toml\n$" "/target" CARGO_TARGET_DIR "${CARGO_TARGET_DIR}") + +set(CARGO_MANIFEST ${PROJECT_SOURCE_DIR}/Cargo.toml) +set(RUST_SOURCE_FILE ${PROJECT_SOURCE_DIR}/src/lib.rs) +set(RUST_BRIDGE_CPP ${CARGO_TARGET_DIR}/cxxbridge/fluss-cpp/src/lib.rs.cc) +set(RUST_HEADER_FILE ${CARGO_TARGET_DIR}/cxxbridge/fluss-cpp/src/lib.rs.h) + +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(RUST_LIB ${CARGO_TARGET_DIR}/debug/${CMAKE_STATIC_LIBRARY_PREFIX}fluss_cpp${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(RUST_LIB ${CARGO_TARGET_DIR}/release/${CMAKE_STATIC_LIBRARY_PREFIX}fluss_cpp${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() + +set(CPP_INCLUDE_DIR 
${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/src + ${CARGO_TARGET_DIR}/cxxbridge + ${CARGO_TARGET_DIR}/cxxbridge/fluss-cpp/src) + +file(GLOB CPP_SOURCE_FILE "src/*.cpp") +file(GLOB CPP_HEADER_FILE "include/*.hpp") + +if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CARGO_BUILD_FLAGS "--release") +endif() + +add_custom_target(cargo_build + COMMAND cargo build --manifest-path ${CARGO_MANIFEST} ${CARGO_BUILD_FLAGS} + BYPRODUCTS ${RUST_BRIDGE_CPP} ${RUST_LIB} ${RUST_HEADER_FILE} + DEPENDS ${RUST_SOURCE_FILE} + USES_TERMINAL + COMMENT "Running cargo..." +) + +add_library(fluss_cpp STATIC ${CPP_SOURCE_FILE} ${RUST_BRIDGE_CPP}) +target_sources(fluss_cpp PUBLIC ${CPP_HEADER_FILE}) +target_sources(fluss_cpp PRIVATE ${RUST_HEADER_FILE}) +target_include_directories(fluss_cpp PUBLIC ${CPP_INCLUDE_DIR}) +target_link_libraries(fluss_cpp PUBLIC ${RUST_LIB}) +target_link_libraries(fluss_cpp PRIVATE ${CMAKE_DL_LIBS} Threads::Threads) +if(APPLE) + target_link_libraries(fluss_cpp PUBLIC "-framework CoreFoundation" "-framework Security") +endif() + +add_executable(fluss_cpp_example examples/example.cpp) +target_link_libraries(fluss_cpp_example fluss_cpp) +target_include_directories(fluss_cpp_example PUBLIC ${CPP_INCLUDE_DIR}) + +set_target_properties(fluss_cpp + PROPERTIES ADDITIONAL_CLEAN_FILES ${CARGO_TARGET_DIR} +) +add_dependencies(fluss_cpp cargo_build) + +if (FLUSS_ENABLE_ADDRESS_SANITIZER) + target_compile_options(fluss_cpp PRIVATE -fsanitize=leak,address,undefined -fno-omit-frame-pointer -fno-common -O1) + target_link_options(fluss_cpp PRIVATE -fsanitize=leak,address,undefined) +endif() \ No newline at end of file diff --git a/fluss-rust/bindings/cpp/Cargo.toml b/fluss-rust/bindings/cpp/Cargo.toml new file mode 100644 index 0000000000..2d3d913550 --- /dev/null +++ b/fluss-rust/bindings/cpp/Cargo.toml @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "fluss-cpp" +version = "0.1.0" +edition.workspace = true +rust-version.workspace = true +publish = false + +[lib] +crate-type = ["staticlib"] + +[dependencies] +anyhow = "1.0" +arrow = { workspace = true } +cxx = "1.0" +fluss = { path = "../../crates/fluss" } +tokio = { version = "1.27", features = ["rt-multi-thread", "macros"] } + +[build-dependencies] +cxx-build = "1.0" diff --git a/fluss-rust/bindings/cpp/build.rs b/fluss-rust/bindings/cpp/build.rs new file mode 100644 index 0000000000..ec75e24aeb --- /dev/null +++ b/fluss-rust/bindings/cpp/build.rs @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +fn main() { + cxx_build::bridge("src/lib.rs") + .std("c++17") + .compile("fluss-cpp-bridge"); + + println!("cargo:rerun-if-changed=src/lib.rs"); +} diff --git a/fluss-rust/bindings/cpp/examples/example.cpp b/fluss-rust/bindings/cpp/examples/example.cpp new file mode 100644 index 0000000000..5146f28216 --- /dev/null +++ b/fluss-rust/bindings/cpp/examples/example.cpp @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "fluss.hpp" + +#include +#include + +static void check(const char* step, const fluss::Result& r) { + if (!r.Ok()) { + std::cerr << step << " failed: code=" << r.error_code + << " msg=" << r.error_message << std::endl; + std::exit(1); + } +} + +int main() { + // 1) Connect + fluss::Connection conn; + check("connect", fluss::Connection::Connect("127.0.0.1:9123", conn)); + + // 2) Admin + fluss::Admin admin; + check("get_admin", conn.GetAdmin(admin)); + + // 3) Schema & descriptor + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int) + .AddColumn("name", fluss::DataType::String) + .AddColumn("score", fluss::DataType::Float) + .AddColumn("age", fluss::DataType::Int) + .Build(); + + auto descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(1) + .SetProperty("table.log.arrow.compression.type", "NONE") + .SetComment("cpp example table") + .Build(); + + fluss::TablePath table_path("fluss", "sample_table_cpp_v1"); + // ignore_if_exists=true to allow re-run + check("create_table", admin.CreateTable(table_path, descriptor, true)); + + // 4) Get table + fluss::Table table; + check("get_table", conn.GetTable(table_path, table)); + + // 5) Writer + fluss::AppendWriter writer; + check("new_append_writer", table.NewAppendWriter(writer)); + + struct RowData { + int id; + const char* name; + float score; + int age; + }; + + std::vector rows = { + {1, "Alice", 95.2f, 25}, + {2, "Bob", 87.2f, 30}, + {3, "Charlie", 92.1f, 35}, + }; + + for (const auto& r : rows) { + fluss::GenericRow row; + row.SetInt32(0, r.id); + row.SetString(1, r.name); + row.SetFloat32(2, r.score); + row.SetInt32(3, r.age); + check("append", writer.Append(row)); + } + check("flush", writer.Flush()); + std::cout << "Wrote " << rows.size() << " rows" << std::endl; + + // 6) Scan + fluss::LogScanner scanner; + check("new_log_scanner", table.NewLogScanner(scanner)); + + auto info = table.GetTableInfo(); + int buckets = info.num_buckets; + 
for (int b = 0; b < buckets; ++b) { + check("subscribe", scanner.Subscribe(b, 0)); + } + + fluss::ScanRecords records; + check("poll", scanner.Poll(5000, records)); + + std::cout << "Scanned records: " << records.records.size() << std::endl; + for (const auto& rec : records.records) { + std::cout << " offset=" << rec.offset << " id=" << rec.row.fields[0].i32_val + << " name=" << rec.row.fields[1].string_val + << " score=" << rec.row.fields[2].f32_val << " age=" << rec.row.fields[3].i32_val + << " ts=" << rec.timestamp << std::endl; + } + + // 7) Project only id (0) and name (1) columns + std::vector projected_columns = {0, 1}; + fluss::LogScanner projected_scanner; + check("new_log_scanner_with_projection", + table.NewLogScannerWithProjection(projected_columns, projected_scanner)); + + for (int b = 0; b < buckets; ++b) { + check("subscribe_projected", projected_scanner.Subscribe(b, 0)); + } + + fluss::ScanRecords projected_records; + check("poll_projected", projected_scanner.Poll(5000, projected_records)); + + std::cout << "Projected records: " << projected_records.records.size() << std::endl; + + bool projection_verified = true; + for (size_t i = 0; i < projected_records.records.size(); ++i) { + const auto& rec = projected_records.records[i]; + const auto& row = rec.row; + + if (row.fields.size() != projected_columns.size()) { + std::cerr << "ERROR: Record " << i << " has " << row.fields.size() + << " fields, expected " << projected_columns.size() << std::endl; + projection_verified = false; + continue; + } + + // Verify field types match expected columns + // Column 0 (id) should be Int32, Column 1 (name) should be String + if (row.fields[0].type != fluss::DatumType::Int32) { + std::cerr << "ERROR: Record " << i << " field 0 type mismatch, expected Int32" << std::endl; + projection_verified = false; + } + if (row.fields[1].type != fluss::DatumType::String) { + std::cerr << "ERROR: Record " << i << " field 1 type mismatch, expected String" << std::endl; + 
projection_verified = false; + } + + // Print projected data + if (row.fields[0].type == fluss::DatumType::Int32 && + row.fields[1].type == fluss::DatumType::String) { + std::cout << " Record " << i << ": id=" << row.fields[0].i32_val + << ", name=" << row.fields[1].string_val << std::endl; + } + } + + if (projection_verified) { + std::cout << "Column pruning verification passed!" << std::endl; + } else { + std::cerr << "Column pruning verification failed!" << std::endl; + std::exit(1); + } + + return 0; +} diff --git a/fluss-rust/bindings/cpp/include/fluss.hpp b/fluss-rust/bindings/cpp/include/fluss.hpp new file mode 100644 index 0000000000..002f80694a --- /dev/null +++ b/fluss-rust/bindings/cpp/include/fluss.hpp @@ -0,0 +1,461 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace fluss { + +namespace ffi { + struct Connection; + struct Admin; + struct Table; + struct AppendWriter; + struct LogScanner; +} // namespace ffi + +enum class DataType { + Boolean = 1, + TinyInt = 2, + SmallInt = 3, + Int = 4, + BigInt = 5, + Float = 6, + Double = 7, + String = 8, + Bytes = 9, + Date = 10, + Time = 11, + Timestamp = 12, + TimestampLtz = 13, +}; + +enum class DatumType { + Null = 0, + Bool = 1, + Int32 = 2, + Int64 = 3, + Float32 = 4, + Float64 = 5, + String = 6, + Bytes = 7, +}; + +struct Result { + int32_t error_code{0}; + std::string error_message; + + bool Ok() const { return error_code == 0; } +}; + +struct TablePath { + std::string database_name; + std::string table_name; + + TablePath() = default; + TablePath(std::string db, std::string tbl) + : database_name(std::move(db)), table_name(std::move(tbl)) {} + + std::string ToString() const { return database_name + "." + table_name; } +}; + +struct Column { + std::string name; + DataType data_type; + std::string comment; +}; + +struct Schema { + std::vector columns; + std::vector primary_keys; + + class Builder { + public: + Builder& AddColumn(std::string name, DataType type, + std::string comment = "") { + columns_.push_back({std::move(name), type, std::move(comment)}); + return *this; + } + + Builder& SetPrimaryKeys(std::vector keys) { + primary_keys_ = std::move(keys); + return *this; + } + + Schema Build() { + return Schema{std::move(columns_), std::move(primary_keys_)}; + } + + private: + std::vector columns_; + std::vector primary_keys_; + }; + + static Builder NewBuilder() { return Builder(); } +}; + +struct TableDescriptor { + Schema schema; + std::vector partition_keys; + int32_t bucket_count{0}; + std::vector bucket_keys; + std::unordered_map properties; + std::string comment; + + class Builder { + public: + Builder& SetSchema(Schema s) { + schema_ = std::move(s); + return *this; + } + + 
Builder& SetPartitionKeys(std::vector keys) { + partition_keys_ = std::move(keys); + return *this; + } + + Builder& SetBucketCount(int32_t count) { + bucket_count_ = count; + return *this; + } + + Builder& SetBucketKeys(std::vector keys) { + bucket_keys_ = std::move(keys); + return *this; + } + + Builder& SetProperty(std::string key, std::string value) { + properties_[std::move(key)] = std::move(value); + return *this; + } + + Builder& SetComment(std::string comment) { + comment_ = std::move(comment); + return *this; + } + + TableDescriptor Build() { + return TableDescriptor{std::move(schema_), + std::move(partition_keys_), + bucket_count_, + std::move(bucket_keys_), + std::move(properties_), + std::move(comment_)}; + } + + private: + Schema schema_; + std::vector partition_keys_; + int32_t bucket_count_{0}; + std::vector bucket_keys_; + std::unordered_map properties_; + std::string comment_; + }; + + static Builder NewBuilder() { return Builder(); } +}; + +struct TableInfo { + int64_t table_id; + int32_t schema_id; + TablePath table_path; + int64_t created_time; + int64_t modified_time; + std::vector primary_keys; + std::vector bucket_keys; + std::vector partition_keys; + int32_t num_buckets; + bool has_primary_key; + bool is_partitioned; + std::unordered_map properties; + std::string comment; + Schema schema; +}; + +struct Datum { + DatumType type{DatumType::Null}; + bool bool_val{false}; + int32_t i32_val{0}; + int64_t i64_val{0}; + float f32_val{0.0F}; + double f64_val{0.0}; + std::string string_val; + std::vector bytes_val; + + static Datum Null() { return {}; } + static Datum Bool(bool v) { + Datum d; + d.type = DatumType::Bool; + d.bool_val = v; + return d; + } + static Datum Int32(int32_t v) { + Datum d; + d.type = DatumType::Int32; + d.i32_val = v; + return d; + } + static Datum Int64(int64_t v) { + Datum d; + d.type = DatumType::Int64; + d.i64_val = v; + return d; + } + static Datum Float32(float v) { + Datum d; + d.type = DatumType::Float32; + d.f32_val 
= v; + return d; + } + static Datum Float64(double v) { + Datum d; + d.type = DatumType::Float64; + d.f64_val = v; + return d; + } + static Datum String(std::string v) { + Datum d; + d.type = DatumType::String; + d.string_val = std::move(v); + return d; + } + static Datum Bytes(std::vector v) { + Datum d; + d.type = DatumType::Bytes; + d.bytes_val = std::move(v); + return d; + } +}; + +struct GenericRow { + std::vector fields; + + void SetNull(size_t idx) { + EnsureSize(idx); + fields[idx] = Datum::Null(); + } + + void SetBool(size_t idx, bool v) { + EnsureSize(idx); + fields[idx] = Datum::Bool(v); + } + + void SetInt32(size_t idx, int32_t v) { + EnsureSize(idx); + fields[idx] = Datum::Int32(v); + } + + void SetInt64(size_t idx, int64_t v) { + EnsureSize(idx); + fields[idx] = Datum::Int64(v); + } + + void SetFloat32(size_t idx, float v) { + EnsureSize(idx); + fields[idx] = Datum::Float32(v); + } + + void SetFloat64(size_t idx, double v) { + EnsureSize(idx); + fields[idx] = Datum::Float64(v); + } + + void SetString(size_t idx, std::string v) { + EnsureSize(idx); + fields[idx] = Datum::String(std::move(v)); + } + + void SetBytes(size_t idx, std::vector v) { + EnsureSize(idx); + fields[idx] = Datum::Bytes(std::move(v)); + } + +private: + void EnsureSize(size_t idx) { + if (fields.size() <= idx) { + fields.resize(idx + 1); + } + } +}; + +struct ScanRecord { + int64_t offset; + int64_t timestamp; + GenericRow row; +}; + +struct ScanRecords { + std::vector records; + + size_t Size() const { return records.size(); } + bool Empty() const { return records.empty(); } + const ScanRecord& operator[](size_t idx) const { return records[idx]; } + + auto begin() const { return records.begin(); } + auto end() const { return records.end(); } +}; + +struct BucketOffset { + int64_t table_id; + int64_t partition_id; + int32_t bucket_id; + int64_t offset; +}; + +struct LakeSnapshot { + int64_t snapshot_id; + std::vector bucket_offsets; +}; + +class AppendWriter; +class LogScanner; 
+class Admin; +class Table; + +class Connection { +public: + Connection() noexcept; + ~Connection() noexcept; + + Connection(const Connection&) = delete; + Connection& operator=(const Connection&) = delete; + Connection(Connection&& other) noexcept; + Connection& operator=(Connection&& other) noexcept; + + static Result Connect(const std::string& bootstrap_server, Connection& out); + + bool Available() const; + + Result GetAdmin(Admin& out); + Result GetTable(const TablePath& table_path, Table& out); + +private: + void Destroy() noexcept; + ffi::Connection* conn_{nullptr}; +}; + +class Admin { +public: + Admin() noexcept; + ~Admin() noexcept; + + Admin(const Admin&) = delete; + Admin& operator=(const Admin&) = delete; + Admin(Admin&& other) noexcept; + Admin& operator=(Admin&& other) noexcept; + + bool Available() const; + + Result CreateTable(const TablePath& table_path, + const TableDescriptor& descriptor, + bool ignore_if_exists = false); + + Result GetTable(const TablePath& table_path, TableInfo& out); + + Result GetLatestLakeSnapshot(const TablePath& table_path, LakeSnapshot& out); + +private: + friend class Connection; + Admin(ffi::Admin* admin) noexcept; + + void Destroy() noexcept; + ffi::Admin* admin_{nullptr}; +}; + +class Table { +public: + Table() noexcept; + ~Table() noexcept; + + Table(const Table&) = delete; + Table& operator=(const Table&) = delete; + Table(Table&& other) noexcept; + Table& operator=(Table&& other) noexcept; + + bool Available() const; + + Result NewAppendWriter(AppendWriter& out); + Result NewLogScanner(LogScanner& out); + Result NewLogScannerWithProjection(const std::vector& column_indices, LogScanner& out); + + TableInfo GetTableInfo() const; + TablePath GetTablePath() const; + bool HasPrimaryKey() const; + +private: + friend class Connection; + Table(ffi::Table* table) noexcept; + + void Destroy() noexcept; + ffi::Table* table_{nullptr}; +}; + +class AppendWriter { +public: + AppendWriter() noexcept; + ~AppendWriter() noexcept; 
+ + AppendWriter(const AppendWriter&) = delete; + AppendWriter& operator=(const AppendWriter&) = delete; + AppendWriter(AppendWriter&& other) noexcept; + AppendWriter& operator=(AppendWriter&& other) noexcept; + + bool Available() const; + + Result Append(const GenericRow& row); + Result Flush(); + +private: + friend class Table; + AppendWriter(ffi::AppendWriter* writer) noexcept; + + void Destroy() noexcept; + ffi::AppendWriter* writer_{nullptr}; +}; + +class LogScanner { +public: + LogScanner() noexcept; + ~LogScanner() noexcept; + + LogScanner(const LogScanner&) = delete; + LogScanner& operator=(const LogScanner&) = delete; + LogScanner(LogScanner&& other) noexcept; + LogScanner& operator=(LogScanner&& other) noexcept; + + bool Available() const; + + Result Subscribe(int32_t bucket_id, int64_t start_offset); + Result Poll(int64_t timeout_ms, ScanRecords& out); + +private: + friend class Table; + LogScanner(ffi::LogScanner* scanner) noexcept; + + void Destroy() noexcept; + ffi::LogScanner* scanner_{nullptr}; +}; + +} // namespace fluss diff --git a/fluss-rust/bindings/cpp/src/admin.cpp b/fluss-rust/bindings/cpp/src/admin.cpp new file mode 100644 index 0000000000..f6997a640a --- /dev/null +++ b/fluss-rust/bindings/cpp/src/admin.cpp @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "fluss.hpp" +#include "lib.rs.h" +#include "ffi_converter.hpp" +#include "rust/cxx.h" + +namespace fluss { + +Admin::Admin() noexcept = default; + +Admin::Admin(ffi::Admin* admin) noexcept : admin_(admin) {} + +Admin::~Admin() noexcept { Destroy(); } + +void Admin::Destroy() noexcept { + if (admin_) { + ffi::delete_admin(admin_); + admin_ = nullptr; + } +} + +Admin::Admin(Admin&& other) noexcept : admin_(other.admin_) { + other.admin_ = nullptr; +} + +Admin& Admin::operator=(Admin&& other) noexcept { + if (this != &other) { + Destroy(); + admin_ = other.admin_; + other.admin_ = nullptr; + } + return *this; +} + +bool Admin::Available() const { return admin_ != nullptr; } + +Result Admin::CreateTable(const TablePath& table_path, + const TableDescriptor& descriptor, + bool ignore_if_exists) { + if (!Available()) { + return utils::make_error(1, "Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + auto ffi_desc = utils::to_ffi_table_descriptor(descriptor); + + auto ffi_result = admin_->create_table(ffi_path, ffi_desc, ignore_if_exists); + return utils::from_ffi_result(ffi_result); +} + +Result Admin::GetTable(const TablePath& table_path, TableInfo& out) { + if (!Available()) { + return utils::make_error(1, "Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + auto ffi_result = admin_->get_table_info(ffi_path); + + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out = utils::from_ffi_table_info(ffi_result.table_info); + } + + return result; +} + +Result Admin::GetLatestLakeSnapshot(const TablePath& table_path, LakeSnapshot& out) { + if (!Available()) { + return utils::make_error(1, "Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + auto ffi_result = admin_->get_latest_lake_snapshot(ffi_path); + + auto result = 
utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out = utils::from_ffi_lake_snapshot(ffi_result.lake_snapshot); + } + + return result; +} + +} // namespace fluss diff --git a/fluss-rust/bindings/cpp/src/connection.cpp b/fluss-rust/bindings/cpp/src/connection.cpp new file mode 100644 index 0000000000..ea884cdb1f --- /dev/null +++ b/fluss-rust/bindings/cpp/src/connection.cpp @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "fluss.hpp" +#include "lib.rs.h" +#include "ffi_converter.hpp" +#include "rust/cxx.h" + +namespace fluss { + +Connection::Connection() noexcept = default; + +Connection::~Connection() noexcept { Destroy(); } + +void Connection::Destroy() noexcept { + if (conn_) { + ffi::delete_connection(conn_); + conn_ = nullptr; + } +} + +Connection::Connection(Connection&& other) noexcept : conn_(other.conn_) { + other.conn_ = nullptr; +} + +Connection& Connection::operator=(Connection&& other) noexcept { + if (this != &other) { + Destroy(); + conn_ = other.conn_; + other.conn_ = nullptr; + } + return *this; +} + +Result Connection::Connect(const std::string& bootstrap_server, Connection& out) { + try { + out.conn_ = ffi::new_connection(bootstrap_server); + return utils::make_ok(); + } catch (const rust::Error& e) { + return utils::make_error(1, e.what()); + } catch (const std::exception& e) { + return utils::make_error(1, e.what()); + } +} + +bool Connection::Available() const { return conn_ != nullptr; } + +Result Connection::GetAdmin(Admin& out) { + if (!Available()) { + return utils::make_error(1, "Connection not available"); + } + + try { + out.admin_ = conn_->get_admin(); + return utils::make_ok(); + } catch (const rust::Error& e) { + return utils::make_error(1, e.what()); + } catch (const std::exception& e) { + return utils::make_error(1, e.what()); + } +} + +Result Connection::GetTable(const TablePath& table_path, Table& out) { + if (!Available()) { + return utils::make_error(1, "Connection not available"); + } + + try { + auto ffi_path = utils::to_ffi_table_path(table_path); + out.table_ = conn_->get_table(ffi_path); + return utils::make_ok(); + } catch (const rust::Error& e) { + return utils::make_error(1, e.what()); + } catch (const std::exception& e) { + return utils::make_error(1, e.what()); + } +} + +} // namespace fluss diff --git a/fluss-rust/bindings/cpp/src/ffi_converter.hpp b/fluss-rust/bindings/cpp/src/ffi_converter.hpp new file mode 100644 
index 0000000000..52dd7fe5d4 --- /dev/null +++ b/fluss-rust/bindings/cpp/src/ffi_converter.hpp @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include "fluss.hpp" +#include "lib.rs.h" + +namespace fluss { +namespace utils { + +inline Result make_error(int32_t code, std::string msg) { + return Result{code, std::move(msg)}; +} + +inline Result make_ok() { + return Result{0, {}}; +} + +inline Result from_ffi_result(const ffi::FfiResult& ffi_result) { + return Result{ffi_result.error_code, std::string(ffi_result.error_message)}; +} + +inline ffi::FfiTablePath to_ffi_table_path(const TablePath& path) { + ffi::FfiTablePath ffi_path; + ffi_path.database_name = rust::String(path.database_name); + ffi_path.table_name = rust::String(path.table_name); + return ffi_path; +} + +inline ffi::FfiColumn to_ffi_column(const Column& col) { + ffi::FfiColumn ffi_col; + ffi_col.name = rust::String(col.name); + ffi_col.data_type = static_cast(col.data_type); + ffi_col.comment = rust::String(col.comment); + return ffi_col; +} + +inline ffi::FfiSchema to_ffi_schema(const Schema& schema) { + ffi::FfiSchema ffi_schema; + + rust::Vec cols; + for (const auto& col : schema.columns) { + 
cols.push_back(to_ffi_column(col)); + } + ffi_schema.columns = std::move(cols); + + rust::Vec pks; + for (const auto& pk : schema.primary_keys) { + pks.push_back(rust::String(pk)); + } + ffi_schema.primary_keys = std::move(pks); + + return ffi_schema; +} + +inline ffi::FfiTableDescriptor to_ffi_table_descriptor(const TableDescriptor& desc) { + ffi::FfiTableDescriptor ffi_desc; + + ffi_desc.schema = to_ffi_schema(desc.schema); + + rust::Vec partition_keys; + for (const auto& pk : desc.partition_keys) { + partition_keys.push_back(rust::String(pk)); + } + ffi_desc.partition_keys = std::move(partition_keys); + + ffi_desc.bucket_count = desc.bucket_count; + + rust::Vec bucket_keys; + for (const auto& bk : desc.bucket_keys) { + bucket_keys.push_back(rust::String(bk)); + } + ffi_desc.bucket_keys = std::move(bucket_keys); + + rust::Vec props; + for (const auto& [k, v] : desc.properties) { + ffi::HashMapValue prop; + prop.key = rust::String(k); + prop.value = rust::String(v); + props.push_back(prop); + } + ffi_desc.properties = std::move(props); + + ffi_desc.comment = rust::String(desc.comment); + + return ffi_desc; +} + +inline ffi::FfiDatum to_ffi_datum(const Datum& datum) { + ffi::FfiDatum ffi_datum; + ffi_datum.datum_type = static_cast(datum.type); + ffi_datum.bool_val = datum.bool_val; + ffi_datum.i32_val = datum.i32_val; + ffi_datum.i64_val = datum.i64_val; + ffi_datum.f32_val = datum.f32_val; + ffi_datum.f64_val = datum.f64_val; + ffi_datum.string_val = rust::String(datum.string_val); + + rust::Vec bytes; + for (auto b : datum.bytes_val) { + bytes.push_back(b); + } + ffi_datum.bytes_val = std::move(bytes); + + return ffi_datum; +} + +inline ffi::FfiGenericRow to_ffi_generic_row(const GenericRow& row) { + ffi::FfiGenericRow ffi_row; + + rust::Vec fields; + for (const auto& field : row.fields) { + fields.push_back(to_ffi_datum(field)); + } + ffi_row.fields = std::move(fields); + + return ffi_row; +} + +inline Column from_ffi_column(const ffi::FfiColumn& ffi_col) { + 
return Column{ + std::string(ffi_col.name), + static_cast(ffi_col.data_type), + std::string(ffi_col.comment)}; +} + +inline Schema from_ffi_schema(const ffi::FfiSchema& ffi_schema) { + Schema schema; + + for (const auto& col : ffi_schema.columns) { + schema.columns.push_back(from_ffi_column(col)); + } + + for (const auto& pk : ffi_schema.primary_keys) { + schema.primary_keys.push_back(std::string(pk)); + } + + return schema; +} + +inline TableInfo from_ffi_table_info(const ffi::FfiTableInfo& ffi_info) { + TableInfo info; + + info.table_id = ffi_info.table_id; + info.schema_id = ffi_info.schema_id; + info.table_path = TablePath{ + std::string(ffi_info.table_path.database_name), + std::string(ffi_info.table_path.table_name)}; + info.created_time = ffi_info.created_time; + info.modified_time = ffi_info.modified_time; + + for (const auto& pk : ffi_info.primary_keys) { + info.primary_keys.push_back(std::string(pk)); + } + + for (const auto& bk : ffi_info.bucket_keys) { + info.bucket_keys.push_back(std::string(bk)); + } + + for (const auto& pk : ffi_info.partition_keys) { + info.partition_keys.push_back(std::string(pk)); + } + + info.num_buckets = ffi_info.num_buckets; + info.has_primary_key = ffi_info.has_primary_key; + info.is_partitioned = ffi_info.is_partitioned; + + for (const auto& prop : ffi_info.properties) { + info.properties[std::string(prop.key)] = std::string(prop.value); + } + + info.comment = std::string(ffi_info.comment); + info.schema = from_ffi_schema(ffi_info.schema); + + return info; +} + +inline Datum from_ffi_datum(const ffi::FfiDatum& ffi_datum) { + Datum datum; + datum.type = static_cast(ffi_datum.datum_type); + datum.bool_val = ffi_datum.bool_val; + datum.i32_val = ffi_datum.i32_val; + datum.i64_val = ffi_datum.i64_val; + datum.f32_val = ffi_datum.f32_val; + datum.f64_val = ffi_datum.f64_val; + // todo: avoid copy string + datum.string_val = std::string(ffi_datum.string_val); + + for (auto b : ffi_datum.bytes_val) { + datum.bytes_val.push_back(b); 
+ } + + return datum; +} + +inline GenericRow from_ffi_generic_row(const ffi::FfiGenericRow& ffi_row) { + GenericRow row; + + for (const auto& field : ffi_row.fields) { + row.fields.push_back(from_ffi_datum(field)); + } + + return row; +} + +inline ScanRecord from_ffi_scan_record(const ffi::FfiScanRecord& ffi_record) { + return ScanRecord{ + ffi_record.offset, + ffi_record.timestamp, + from_ffi_generic_row(ffi_record.row)}; +} + +inline ScanRecords from_ffi_scan_records(const ffi::FfiScanRecords& ffi_records) { + ScanRecords records; + + for (const auto& record : ffi_records.records) { + records.records.push_back(from_ffi_scan_record(record)); + } + + return records; +} + +inline LakeSnapshot from_ffi_lake_snapshot(const ffi::FfiLakeSnapshot& ffi_snapshot) { + LakeSnapshot snapshot; + snapshot.snapshot_id = ffi_snapshot.snapshot_id; + + for (const auto& offset : ffi_snapshot.bucket_offsets) { + snapshot.bucket_offsets.push_back(BucketOffset{ + offset.table_id, + offset.partition_id, + offset.bucket_id, + offset.offset}); + } + + return snapshot; +} + +} // namespace utils +} // namespace fluss diff --git a/fluss-rust/bindings/cpp/src/lib.rs b/fluss-rust/bindings/cpp/src/lib.rs new file mode 100644 index 0000000000..3e883e29c6 --- /dev/null +++ b/fluss-rust/bindings/cpp/src/lib.rs @@ -0,0 +1,523 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod types; + +use std::sync::{Arc, LazyLock}; +use std::time::Duration; + +use fluss as fcore; + +static RUNTIME: LazyLock = LazyLock::new(|| { + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap() +}); + +#[cxx::bridge(namespace = "fluss::ffi")] +mod ffi { + struct HashMapValue { + key: String, + value: String, + } + + struct FfiResult { + error_code: i32, + error_message: String, + } + + struct FfiTablePath { + database_name: String, + table_name: String, + } + + struct FfiColumn { + name: String, + data_type: i32, + comment: String, + } + + struct FfiSchema { + columns: Vec, + primary_keys: Vec, + } + + struct FfiTableDescriptor { + schema: FfiSchema, + partition_keys: Vec, + bucket_count: i32, + bucket_keys: Vec, + properties: Vec, + comment: String, + } + + struct FfiTableInfo { + table_id: i64, + schema_id: i32, + table_path: FfiTablePath, + created_time: i64, + modified_time: i64, + primary_keys: Vec, + bucket_keys: Vec, + partition_keys: Vec, + num_buckets: i32, + has_primary_key: bool, + is_partitioned: bool, + properties: Vec, + comment: String, + schema: FfiSchema, + } + + struct FfiTableInfoResult { + result: FfiResult, + table_info: FfiTableInfo, + } + + struct FfiDatum { + datum_type: i32, + bool_val: bool, + i32_val: i32, + i64_val: i64, + f32_val: f32, + f64_val: f64, + string_val: String, + bytes_val: Vec, + } + + struct FfiGenericRow { + fields: Vec, + } + + struct FfiScanRecord { + offset: i64, + timestamp: i64, + row: FfiGenericRow, + } + + struct FfiScanRecords { + records: Vec, + } + + struct 
FfiScanRecordsResult { + result: FfiResult, + scan_records: FfiScanRecords, + } + + struct FfiLakeSnapshot { + snapshot_id: i64, + bucket_offsets: Vec, + } + + struct FfiBucketOffset { + table_id: i64, + partition_id: i64, + bucket_id: i32, + offset: i64, + } + + struct FfiLakeSnapshotResult { + result: FfiResult, + lake_snapshot: FfiLakeSnapshot, + } + + extern "Rust" { + type Connection; + type Admin; + type Table; + type AppendWriter; + type LogScanner; + + // Connection + fn new_connection(bootstrap_server: &str) -> Result<*mut Connection>; + unsafe fn delete_connection(conn: *mut Connection); + fn get_admin(self: &Connection) -> Result<*mut Admin>; + fn get_table(self: &Connection, table_path: &FfiTablePath) -> Result<*mut Table>; + + // Admin + unsafe fn delete_admin(admin: *mut Admin); + fn create_table( + self: &Admin, + table_path: &FfiTablePath, + descriptor: &FfiTableDescriptor, + ignore_if_exists: bool, + ) -> FfiResult; + fn get_table_info(self: &Admin, table_path: &FfiTablePath) -> FfiTableInfoResult; + fn get_latest_lake_snapshot( + self: &Admin, + table_path: &FfiTablePath, + ) -> FfiLakeSnapshotResult; + + // Table + unsafe fn delete_table(table: *mut Table); + fn new_append_writer(self: &Table) -> Result<*mut AppendWriter>; + fn new_log_scanner(self: &Table) -> Result<*mut LogScanner>; + fn new_log_scanner_with_projection( + self: &Table, + column_indices: Vec, + ) -> Result<*mut LogScanner>; + fn get_table_info_from_table(self: &Table) -> FfiTableInfo; + fn get_table_path(self: &Table) -> FfiTablePath; + fn has_primary_key(self: &Table) -> bool; + + // AppendWriter + unsafe fn delete_append_writer(writer: *mut AppendWriter); + fn append(self: &mut AppendWriter, row: &FfiGenericRow) -> FfiResult; + fn flush(self: &mut AppendWriter) -> FfiResult; + + // LogScanner + unsafe fn delete_log_scanner(scanner: *mut LogScanner); + fn subscribe(self: &LogScanner, bucket_id: i32, start_offset: i64) -> FfiResult; + fn poll(self: &LogScanner, timeout_ms: i64) 
-> FfiScanRecordsResult; + } +} + +pub struct Connection { + inner: Arc, + #[allow(dead_code)] + metadata: Option>, +} + +pub struct Admin { + inner: fcore::client::FlussAdmin, +} + +pub struct Table { + connection: Arc, + metadata: Arc, + table_info: fcore::metadata::TableInfo, + table_path: fcore::metadata::TablePath, + has_pk: bool, +} + +pub struct AppendWriter { + inner: fcore::client::AppendWriter, +} + +pub struct LogScanner { + inner: fcore::client::LogScanner, +} + +fn ok_result() -> ffi::FfiResult { + ffi::FfiResult { + error_code: 0, + error_message: String::new(), + } +} + +fn err_result(code: i32, msg: String) -> ffi::FfiResult { + ffi::FfiResult { + error_code: code, + error_message: msg, + } +} + +// Connection implementation +fn new_connection(bootstrap_server: &str) -> Result<*mut Connection, String> { + let config = fluss::config::Config { + bootstrap_server: Some(bootstrap_server.to_string()), + ..Default::default() + }; + + let conn = RUNTIME.block_on(async { fcore::client::FlussConnection::new(config).await }); + + match conn { + Ok(c) => { + let conn = Box::into_raw(Box::new(Connection { + inner: Arc::new(c), + metadata: None, + })); + Ok(conn) + } + Err(e) => Err(format!("Failed to connect: {}", e)), + } +} + +unsafe fn delete_connection(conn: *mut Connection) { + if !conn.is_null() { + unsafe { + drop(Box::from_raw(conn)); + } + } +} + +impl Connection { + fn get_admin(&self) -> Result<*mut Admin, String> { + let admin_result = RUNTIME.block_on(async { self.inner.get_admin().await }); + + match admin_result { + Ok(admin) => { + let admin = Box::into_raw(Box::new(Admin { inner: admin })); + Ok(admin) + } + Err(e) => Err(format!("Failed to get admin: {}", e)), + } + } + + fn get_table(&self, table_path: &ffi::FfiTablePath) -> Result<*mut Table, String> { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let table_result = RUNTIME.block_on(async { 
self.inner.get_table(&path).await }); + + match table_result { + Ok(t) => { + let table = Box::into_raw(Box::new(Table { + connection: self.inner.clone(), + metadata: t.metadata().clone(), + table_info: t.table_info().clone(), + table_path: t.table_path().clone(), + has_pk: t.has_primary_key(), + })); + Ok(table) + } + Err(e) => Err(format!("Failed to get table: {}", e)), + } + } +} + +// Admin implementation +unsafe fn delete_admin(admin: *mut Admin) { + if !admin.is_null() { + unsafe { + drop(Box::from_raw(admin)); + } + } +} + +impl Admin { + fn create_table( + &self, + table_path: &ffi::FfiTablePath, + descriptor: &ffi::FfiTableDescriptor, + ignore_if_exists: bool, + ) -> ffi::FfiResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let core_descriptor = match types::ffi_descriptor_to_core(descriptor) { + Ok(d) => d, + Err(e) => return err_result(1, e.to_string()), + }; + + let result = RUNTIME.block_on(async { + self.inner + .create_table(&path, &core_descriptor, ignore_if_exists) + .await + }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_result(2, e.to_string()), + } + } + + fn get_table_info(&self, table_path: &ffi::FfiTablePath) -> ffi::FfiTableInfoResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let result = RUNTIME.block_on(async { self.inner.get_table(&path).await }); + + match result { + Ok(info) => ffi::FfiTableInfoResult { + result: ok_result(), + table_info: types::core_table_info_to_ffi(&info), + }, + Err(e) => ffi::FfiTableInfoResult { + result: err_result(1, e.to_string()), + table_info: types::empty_table_info(), + }, + } + } + + fn get_latest_lake_snapshot( + &self, + table_path: &ffi::FfiTablePath, + ) -> ffi::FfiLakeSnapshotResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let result = 
RUNTIME.block_on(async { self.inner.get_latest_lake_snapshot(&path).await }); + + match result { + Ok(snapshot) => ffi::FfiLakeSnapshotResult { + result: ok_result(), + lake_snapshot: types::core_lake_snapshot_to_ffi(&snapshot), + }, + Err(e) => ffi::FfiLakeSnapshotResult { + result: err_result(1, e.to_string()), + lake_snapshot: ffi::FfiLakeSnapshot { + snapshot_id: -1, + bucket_offsets: vec![], + }, + }, + } + } +} + +// Table implementation +unsafe fn delete_table(table: *mut Table) { + if !table.is_null() { + unsafe { + drop(Box::from_raw(table)); + } + } +} + +impl Table { + fn new_append_writer(&self) -> Result<*mut AppendWriter, String> { + let _enter = RUNTIME.enter(); + + let fluss_table = fcore::client::FlussTable::new( + &self.connection, + self.metadata.clone(), + self.table_info.clone(), + ); + + let table_append = match fluss_table.new_append() { + Ok(a) => a, + Err(e) => return Err(format!("Failed to create append: {}", e)), + }; + + let writer = table_append.create_writer(); + let writer = Box::into_raw(Box::new(AppendWriter { inner: writer })); + Ok(writer) + } + + fn new_log_scanner(&self) -> Result<*mut LogScanner, String> { + let fluss_table = fcore::client::FlussTable::new( + &self.connection, + self.metadata.clone(), + self.table_info.clone(), + ); + + let scanner = fluss_table.new_scan().create_log_scanner(); + let scanner = Box::into_raw(Box::new(LogScanner { inner: scanner })); + Ok(scanner) + } + + fn new_log_scanner_with_projection( + &self, + column_indices: Vec, + ) -> Result<*mut LogScanner, String> { + let fluss_table = fcore::client::FlussTable::new( + &self.connection, + self.metadata.clone(), + self.table_info.clone(), + ); + + let scan = fluss_table.new_scan(); + let scan = match scan.project(&column_indices) { + Ok(s) => s, + Err(e) => return Err(format!("Failed to project columns: {}", e)), + }; + let scanner = scan.create_log_scanner(); + let scanner = Box::into_raw(Box::new(LogScanner { inner: scanner })); + Ok(scanner) + } + 
+ fn get_table_info_from_table(&self) -> ffi::FfiTableInfo { + types::core_table_info_to_ffi(&self.table_info) + } + + fn get_table_path(&self) -> ffi::FfiTablePath { + ffi::FfiTablePath { + database_name: self.table_path.database().to_string(), + table_name: self.table_path.table().to_string(), + } + } + + fn has_primary_key(&self) -> bool { + self.has_pk + } +} + +// AppendWriter implementation +unsafe fn delete_append_writer(writer: *mut AppendWriter) { + if !writer.is_null() { + unsafe { + drop(Box::from_raw(writer)); + } + } +} + +impl AppendWriter { + fn append(&mut self, row: &ffi::FfiGenericRow) -> ffi::FfiResult { + let generic_row = types::ffi_row_to_core(row); + + let result = RUNTIME.block_on(async { self.inner.append(generic_row).await }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_result(1, e.to_string()), + } + } + + fn flush(&mut self) -> ffi::FfiResult { + let result = RUNTIME.block_on(async { self.inner.flush().await }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_result(1, e.to_string()), + } + } +} + +// LogScanner implementation +unsafe fn delete_log_scanner(scanner: *mut LogScanner) { + if !scanner.is_null() { + unsafe { + drop(Box::from_raw(scanner)); + } + } +} + +impl LogScanner { + fn subscribe(&self, bucket_id: i32, start_offset: i64) -> ffi::FfiResult { + let result = + RUNTIME.block_on(async { self.inner.subscribe(bucket_id, start_offset).await }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_result(1, e.to_string()), + } + } + + fn poll(&self, timeout_ms: i64) -> ffi::FfiScanRecordsResult { + let timeout = Duration::from_millis(timeout_ms as u64); + let result = RUNTIME.block_on(async { self.inner.poll(timeout).await }); + + match result { + Ok(records) => ffi::FfiScanRecordsResult { + result: ok_result(), + scan_records: types::core_scan_records_to_ffi(&records), + }, + Err(e) => ffi::FfiScanRecordsResult { + result: err_result(1, e.to_string()), + scan_records: ffi::FfiScanRecords { 
records: vec![] }, + }, + } + } +} diff --git a/fluss-rust/bindings/cpp/src/table.cpp b/fluss-rust/bindings/cpp/src/table.cpp new file mode 100644 index 0000000000..b28b783ee8 --- /dev/null +++ b/fluss-rust/bindings/cpp/src/table.cpp @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "fluss.hpp" +#include "lib.rs.h" +#include "ffi_converter.hpp" +#include "rust/cxx.h" + +namespace fluss { + +Table::Table() noexcept = default; + +Table::Table(ffi::Table* table) noexcept : table_(table) {} + +Table::~Table() noexcept { Destroy(); } + +void Table::Destroy() noexcept { + if (table_) { + ffi::delete_table(table_); + table_ = nullptr; + } +} + +Table::Table(Table&& other) noexcept : table_(other.table_) { + other.table_ = nullptr; +} + +Table& Table::operator=(Table&& other) noexcept { + if (this != &other) { + Destroy(); + table_ = other.table_; + other.table_ = nullptr; + } + return *this; +} + +bool Table::Available() const { return table_ != nullptr; } + +Result Table::NewAppendWriter(AppendWriter& out) { + if (!Available()) { + return utils::make_error(1, "Table not available"); + } + + try { + out.writer_ = table_->new_append_writer(); + return utils::make_ok(); + } catch (const rust::Error& e) { + return utils::make_error(1, e.what()); + } catch (const std::exception& e) { + return utils::make_error(1, e.what()); + } +} + +Result Table::NewLogScanner(LogScanner& out) { + if (!Available()) { + return utils::make_error(1, "Table not available"); + } + + try { + out.scanner_ = table_->new_log_scanner(); + return utils::make_ok(); + } catch (const rust::Error& e) { + return utils::make_error(1, e.what()); + } catch (const std::exception& e) { + return utils::make_error(1, e.what()); + } +} + +Result Table::NewLogScannerWithProjection(const std::vector& column_indices, LogScanner& out) { + if (!Available()) { + return utils::make_error(1, "Table not available"); + } + + try { + rust::Vec rust_indices; + for (size_t idx : column_indices) { + rust_indices.push_back(idx); + } + out.scanner_ = table_->new_log_scanner_with_projection(std::move(rust_indices)); + return utils::make_ok(); + } catch (const rust::Error& e) { + return utils::make_error(1, e.what()); + } catch (const std::exception& e) { + return utils::make_error(1, 
e.what()); + } +} + +TableInfo Table::GetTableInfo() const { + if (!Available()) { + return TableInfo{}; + } + auto ffi_info = table_->get_table_info_from_table(); + return utils::from_ffi_table_info(ffi_info); +} + +TablePath Table::GetTablePath() const { + if (!Available()) { + return TablePath{}; + } + auto ffi_path = table_->get_table_path(); + return TablePath{std::string(ffi_path.database_name), std::string(ffi_path.table_name)}; +} + +bool Table::HasPrimaryKey() const { + if (!Available()) { + return false; + } + return table_->has_primary_key(); +} + +// AppendWriter implementation +AppendWriter::AppendWriter() noexcept = default; + +AppendWriter::AppendWriter(ffi::AppendWriter* writer) noexcept : writer_(writer) {} + +AppendWriter::~AppendWriter() noexcept { Destroy(); } + +void AppendWriter::Destroy() noexcept { + if (writer_) { + ffi::delete_append_writer(writer_); + writer_ = nullptr; + } +} + +AppendWriter::AppendWriter(AppendWriter&& other) noexcept : writer_(other.writer_) { + other.writer_ = nullptr; +} + +AppendWriter& AppendWriter::operator=(AppendWriter&& other) noexcept { + if (this != &other) { + Destroy(); + writer_ = other.writer_; + other.writer_ = nullptr; + } + return *this; +} + +bool AppendWriter::Available() const { return writer_ != nullptr; } + +Result AppendWriter::Append(const GenericRow& row) { + if (!Available()) { + return utils::make_error(1, "AppendWriter not available"); + } + + auto ffi_row = utils::to_ffi_generic_row(row); + auto ffi_result = writer_->append(ffi_row); + return utils::from_ffi_result(ffi_result); +} + +Result AppendWriter::Flush() { + if (!Available()) { + return utils::make_error(1, "AppendWriter not available"); + } + + auto ffi_result = writer_->flush(); + return utils::from_ffi_result(ffi_result); +} + +// LogScanner implementation +LogScanner::LogScanner() noexcept = default; + +LogScanner::LogScanner(ffi::LogScanner* scanner) noexcept : scanner_(scanner) {} + +LogScanner::~LogScanner() noexcept { 
Destroy(); } + +void LogScanner::Destroy() noexcept { + if (scanner_) { + ffi::delete_log_scanner(scanner_); + scanner_ = nullptr; + } +} + +LogScanner::LogScanner(LogScanner&& other) noexcept : scanner_(other.scanner_) { + other.scanner_ = nullptr; +} + +LogScanner& LogScanner::operator=(LogScanner&& other) noexcept { + if (this != &other) { + Destroy(); + scanner_ = other.scanner_; + other.scanner_ = nullptr; + } + return *this; +} + +bool LogScanner::Available() const { return scanner_ != nullptr; } + +Result LogScanner::Subscribe(int32_t bucket_id, int64_t start_offset) { + if (!Available()) { + return utils::make_error(1, "LogScanner not available"); + } + + auto ffi_result = scanner_->subscribe(bucket_id, start_offset); + return utils::from_ffi_result(ffi_result); +} + +Result LogScanner::Poll(int64_t timeout_ms, ScanRecords& out) { + if (!Available()) { + return utils::make_error(1, "LogScanner not available"); + } + + auto ffi_result = scanner_->poll(timeout_ms); + auto result = utils::from_ffi_result(ffi_result.result); + if (!result.Ok()) { + return result; + } + + out = utils::from_ffi_scan_records(ffi_result.scan_records); + return utils::make_ok(); +} + +} // namespace fluss diff --git a/fluss-rust/bindings/cpp/src/types.rs b/fluss-rust/bindings/cpp/src/types.rs new file mode 100644 index 0000000000..d3bab38abe --- /dev/null +++ b/fluss-rust/bindings/cpp/src/types.rs @@ -0,0 +1,485 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::ffi; +use anyhow::{Result, anyhow}; +use arrow::array::{ + Date32Array, LargeBinaryArray, LargeStringArray, Time32MillisecondArray, Time32SecondArray, + Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, +}; +use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; +use fcore::row::InternalRow; +use fluss as fcore; + +pub const DATA_TYPE_BOOLEAN: i32 = 1; +pub const DATA_TYPE_TINYINT: i32 = 2; +pub const DATA_TYPE_SMALLINT: i32 = 3; +pub const DATA_TYPE_INT: i32 = 4; +pub const DATA_TYPE_BIGINT: i32 = 5; +pub const DATA_TYPE_FLOAT: i32 = 6; +pub const DATA_TYPE_DOUBLE: i32 = 7; +pub const DATA_TYPE_STRING: i32 = 8; +pub const DATA_TYPE_BYTES: i32 = 9; +pub const DATA_TYPE_DATE: i32 = 10; +pub const DATA_TYPE_TIME: i32 = 11; +pub const DATA_TYPE_TIMESTAMP: i32 = 12; +pub const DATA_TYPE_TIMESTAMP_LTZ: i32 = 13; + +pub const DATUM_TYPE_NULL: i32 = 0; +pub const DATUM_TYPE_BOOL: i32 = 1; +pub const DATUM_TYPE_INT32: i32 = 2; +pub const DATUM_TYPE_INT64: i32 = 3; +pub const DATUM_TYPE_FLOAT32: i32 = 4; +pub const DATUM_TYPE_FLOAT64: i32 = 5; +pub const DATUM_TYPE_STRING: i32 = 6; +pub const DATUM_TYPE_BYTES: i32 = 7; + +fn ffi_data_type_to_core(dt: i32) -> Result { + match dt { + DATA_TYPE_BOOLEAN => Ok(fcore::metadata::DataTypes::boolean()), + DATA_TYPE_TINYINT => Ok(fcore::metadata::DataTypes::tinyint()), + DATA_TYPE_SMALLINT => Ok(fcore::metadata::DataTypes::smallint()), + DATA_TYPE_INT => Ok(fcore::metadata::DataTypes::int()), + 
DATA_TYPE_BIGINT => Ok(fcore::metadata::DataTypes::bigint()), + DATA_TYPE_FLOAT => Ok(fcore::metadata::DataTypes::float()), + DATA_TYPE_DOUBLE => Ok(fcore::metadata::DataTypes::double()), + DATA_TYPE_STRING => Ok(fcore::metadata::DataTypes::string()), + DATA_TYPE_BYTES => Ok(fcore::metadata::DataTypes::bytes()), + DATA_TYPE_DATE => Ok(fcore::metadata::DataTypes::date()), + DATA_TYPE_TIME => Ok(fcore::metadata::DataTypes::time()), + DATA_TYPE_TIMESTAMP => Ok(fcore::metadata::DataTypes::timestamp()), + DATA_TYPE_TIMESTAMP_LTZ => Ok(fcore::metadata::DataTypes::timestamp_ltz()), + _ => Err(anyhow!("Unknown data type: {}", dt)), + } +} + +fn core_data_type_to_ffi(dt: &fcore::metadata::DataType) -> i32 { + match dt { + fcore::metadata::DataType::Boolean(_) => DATA_TYPE_BOOLEAN, + fcore::metadata::DataType::TinyInt(_) => DATA_TYPE_TINYINT, + fcore::metadata::DataType::SmallInt(_) => DATA_TYPE_SMALLINT, + fcore::metadata::DataType::Int(_) => DATA_TYPE_INT, + fcore::metadata::DataType::BigInt(_) => DATA_TYPE_BIGINT, + fcore::metadata::DataType::Float(_) => DATA_TYPE_FLOAT, + fcore::metadata::DataType::Double(_) => DATA_TYPE_DOUBLE, + fcore::metadata::DataType::String(_) => DATA_TYPE_STRING, + fcore::metadata::DataType::Bytes(_) => DATA_TYPE_BYTES, + fcore::metadata::DataType::Date(_) => DATA_TYPE_DATE, + fcore::metadata::DataType::Time(_) => DATA_TYPE_TIME, + fcore::metadata::DataType::Timestamp(_) => DATA_TYPE_TIMESTAMP, + fcore::metadata::DataType::TimestampLTz(_) => DATA_TYPE_TIMESTAMP_LTZ, + _ => 0, + } +} + +pub fn ffi_descriptor_to_core( + descriptor: &ffi::FfiTableDescriptor, +) -> Result { + let mut schema_builder = fcore::metadata::Schema::builder(); + + for col in &descriptor.schema.columns { + let dt = ffi_data_type_to_core(col.data_type)?; + schema_builder = schema_builder.column(&col.name, dt); + if !col.comment.is_empty() { + schema_builder = schema_builder.with_comment(&col.comment); + } + } + + if !descriptor.schema.primary_keys.is_empty() { + schema_builder 
= schema_builder.primary_key(descriptor.schema.primary_keys.clone()); + } + + let schema = schema_builder.build()?; + + let mut builder = fcore::metadata::TableDescriptor::builder() + .schema(schema) + .partitioned_by(descriptor.partition_keys.clone()); + + if descriptor.bucket_count > 0 { + builder = builder.distributed_by( + Some(descriptor.bucket_count), + descriptor.bucket_keys.clone(), + ); + } else { + builder = builder.distributed_by(None, descriptor.bucket_keys.clone()); + } + + for prop in &descriptor.properties { + builder = builder.property(&prop.key, &prop.value); + } + + if !descriptor.comment.is_empty() { + builder = builder.comment(&descriptor.comment); + } + + Ok(builder.build()?) +} + +pub fn core_table_info_to_ffi(info: &fcore::metadata::TableInfo) -> ffi::FfiTableInfo { + let schema = info.get_schema(); + let columns: Vec = schema + .columns() + .iter() + .map(|col| ffi::FfiColumn { + name: col.name().to_string(), + data_type: core_data_type_to_ffi(col.data_type()), + comment: col.comment().unwrap_or("").to_string(), + }) + .collect(); + + let primary_keys: Vec = schema + .primary_key() + .map(|pk| pk.column_names().to_vec()) + .unwrap_or_default(); + + let properties: Vec = info + .get_properties() + .iter() + .map(|(k, v)| ffi::HashMapValue { + key: k.clone(), + value: v.clone(), + }) + .collect(); + + ffi::FfiTableInfo { + table_id: info.get_table_id(), + schema_id: info.get_schema_id(), + table_path: ffi::FfiTablePath { + database_name: info.get_table_path().database().to_string(), + table_name: info.get_table_path().table().to_string(), + }, + created_time: info.get_created_time(), + modified_time: info.get_modified_time(), + primary_keys: info.get_primary_keys().clone(), + bucket_keys: info.get_bucket_keys().to_vec(), + partition_keys: info.get_partition_keys().to_vec(), + num_buckets: info.get_num_buckets(), + has_primary_key: info.has_primary_key(), + is_partitioned: info.is_partitioned(), + properties, + comment: 
info.get_comment().unwrap_or("").to_string(), + schema: ffi::FfiSchema { + columns, + primary_keys, + }, + } +} + +pub fn empty_table_info() -> ffi::FfiTableInfo { + ffi::FfiTableInfo { + table_id: 0, + schema_id: 0, + table_path: ffi::FfiTablePath { + database_name: String::new(), + table_name: String::new(), + }, + created_time: 0, + modified_time: 0, + primary_keys: vec![], + bucket_keys: vec![], + partition_keys: vec![], + num_buckets: 0, + has_primary_key: false, + is_partitioned: false, + properties: vec![], + comment: String::new(), + schema: ffi::FfiSchema { + columns: vec![], + primary_keys: vec![], + }, + } +} + +pub fn ffi_row_to_core(row: &ffi::FfiGenericRow) -> fcore::row::GenericRow<'_> { + use fcore::row::Datum; + + let mut generic_row = fcore::row::GenericRow::new(); + + for (idx, field) in row.fields.iter().enumerate() { + let datum = match field.datum_type { + DATUM_TYPE_NULL => Datum::Null, + DATUM_TYPE_BOOL => Datum::Bool(field.bool_val), + DATUM_TYPE_INT32 => Datum::Int32(field.i32_val), + DATUM_TYPE_INT64 => Datum::Int64(field.i64_val), + DATUM_TYPE_FLOAT32 => Datum::Float32(field.f32_val.into()), + DATUM_TYPE_FLOAT64 => Datum::Float64(field.f64_val.into()), + DATUM_TYPE_STRING => Datum::String(field.string_val.as_str()), + // todo: avoid copy bytes for blob + DATUM_TYPE_BYTES => Datum::Blob(field.bytes_val.clone().into()), + _ => Datum::Null, + }; + generic_row.set_field(idx, datum); + } + + generic_row +} + +pub fn core_scan_records_to_ffi(records: &fcore::record::ScanRecords) -> ffi::FfiScanRecords { + let mut ffi_records = Vec::new(); + + // Iterate over all buckets and their records + for bucket_records in records.records_by_buckets().values() { + for record in bucket_records { + let row = record.row(); + let fields = core_row_to_ffi_fields(row); + + ffi_records.push(ffi::FfiScanRecord { + offset: record.offset(), + timestamp: record.timestamp(), + row: ffi::FfiGenericRow { fields }, + }); + } + } + + ffi::FfiScanRecords { + records: 
ffi_records, + } +} + +fn core_row_to_ffi_fields(row: &fcore::row::ColumnarRow) -> Vec { + fn new_datum(datum_type: i32) -> ffi::FfiDatum { + ffi::FfiDatum { + datum_type, + bool_val: false, + i32_val: 0, + i64_val: 0, + f32_val: 0.0, + f64_val: 0.0, + string_val: String::new(), + bytes_val: vec![], + } + } + + let record_batch = row.get_record_batch(); + let schema = record_batch.schema(); + let row_id = row.get_row_id(); + + let mut fields = Vec::with_capacity(schema.fields().len()); + + for (i, field) in schema.fields().iter().enumerate() { + if row.is_null_at(i) { + fields.push(new_datum(DATUM_TYPE_NULL)); + continue; + } + + let datum = match field.data_type() { + ArrowDataType::Boolean => { + let mut datum = new_datum(DATUM_TYPE_BOOL); + datum.bool_val = row.get_boolean(i); + datum + } + ArrowDataType::Int8 => { + let mut datum = new_datum(DATUM_TYPE_INT32); + datum.i32_val = row.get_byte(i) as i32; + datum + } + ArrowDataType::Int16 => { + let mut datum = new_datum(DATUM_TYPE_INT32); + datum.i32_val = row.get_short(i) as i32; + datum + } + ArrowDataType::Int32 => { + let mut datum = new_datum(DATUM_TYPE_INT32); + datum.i32_val = row.get_int(i); + datum + } + ArrowDataType::Int64 => { + let mut datum = new_datum(DATUM_TYPE_INT64); + datum.i64_val = row.get_long(i); + datum + } + ArrowDataType::Float32 => { + let mut datum = new_datum(DATUM_TYPE_FLOAT32); + datum.f32_val = row.get_float(i); + datum + } + ArrowDataType::Float64 => { + let mut datum = new_datum(DATUM_TYPE_FLOAT64); + datum.f64_val = row.get_double(i); + datum + } + ArrowDataType::Utf8 => { + let mut datum = new_datum(DATUM_TYPE_STRING); + // todo: avoid copy string + datum.string_val = row.get_string(i).to_string(); + datum + } + ArrowDataType::LargeUtf8 => { + let array = record_batch + .column(i) + .as_any() + .downcast_ref::() + .expect("LargeUtf8 column expected"); + let mut datum = new_datum(DATUM_TYPE_STRING); + datum.string_val = array.value(row_id).to_string(); + datum + } + 
ArrowDataType::Binary => { + let mut datum = new_datum(DATUM_TYPE_BYTES); + // todo: avoid copy bytes for blob + datum.bytes_val = row.get_bytes(i); + datum + } + ArrowDataType::FixedSizeBinary(len) => { + let mut datum = new_datum(DATUM_TYPE_BYTES); + datum.bytes_val = row.get_binary(i, *len as usize); + datum + } + ArrowDataType::LargeBinary => { + let array = record_batch + .column(i) + .as_any() + .downcast_ref::() + .expect("LargeBinary column expected"); + let mut datum = new_datum(DATUM_TYPE_BYTES); + datum.bytes_val = array.value(row_id).to_vec(); + datum + } + ArrowDataType::Date32 => { + let array = record_batch + .column(i) + .as_any() + .downcast_ref::() + .expect("Date32 column expected"); + let mut datum = new_datum(DATUM_TYPE_INT32); + datum.i32_val = array.value(row_id); + datum + } + ArrowDataType::Timestamp(unit, _) => match unit { + TimeUnit::Second => { + let array = record_batch + .column(i) + .as_any() + .downcast_ref::() + .expect("Timestamp(second) column expected"); + let mut datum = new_datum(DATUM_TYPE_INT64); + datum.i64_val = array.value(row_id); + datum + } + TimeUnit::Millisecond => { + let array = record_batch + .column(i) + .as_any() + .downcast_ref::() + .expect("Timestamp(millisecond) column expected"); + let mut datum = new_datum(DATUM_TYPE_INT64); + datum.i64_val = array.value(row_id); + datum + } + TimeUnit::Microsecond => { + let array = record_batch + .column(i) + .as_any() + .downcast_ref::() + .expect("Timestamp(microsecond) column expected"); + let mut datum = new_datum(DATUM_TYPE_INT64); + datum.i64_val = array.value(row_id); + datum + } + TimeUnit::Nanosecond => { + let array = record_batch + .column(i) + .as_any() + .downcast_ref::() + .expect("Timestamp(nanosecond) column expected"); + let mut datum = new_datum(DATUM_TYPE_INT64); + datum.i64_val = array.value(row_id); + datum + } + }, + ArrowDataType::Time32(unit) => match unit { + TimeUnit::Second => { + let array = record_batch + .column(i) + .as_any() + 
.downcast_ref::() + .expect("Time32(second) column expected"); + let mut datum = new_datum(DATUM_TYPE_INT32); + datum.i32_val = array.value(row_id); + datum + } + TimeUnit::Millisecond => { + let array = record_batch + .column(i) + .as_any() + .downcast_ref::() + .expect("Time32(millisecond) column expected"); + let mut datum = new_datum(DATUM_TYPE_INT32); + datum.i32_val = array.value(row_id); + datum + } + _ => panic!( + "Will never come here. Unsupported Time32 unit for column {}", + i + ), + }, + ArrowDataType::Time64(unit) => match unit { + TimeUnit::Microsecond => { + let array = record_batch + .column(i) + .as_any() + .downcast_ref::() + .expect("Time64(microsecond) column expected"); + let mut datum = new_datum(DATUM_TYPE_INT64); + datum.i64_val = array.value(row_id); + datum + } + TimeUnit::Nanosecond => { + let array = record_batch + .column(i) + .as_any() + .downcast_ref::() + .expect("Time64(nanosecond) column expected"); + let mut datum = new_datum(DATUM_TYPE_INT64); + datum.i64_val = array.value(row_id); + datum + } + _ => panic!( + "Will never come here. Unsupported Time64 unit for column {}", + i + ), + }, + other => panic!( + "Will never come here. 
Unsupported Arrow data type for column {}: {:?}", + i, other + ), + }; + + fields.push(datum); + } + + fields +} + +pub fn core_lake_snapshot_to_ffi(snapshot: &fcore::metadata::LakeSnapshot) -> ffi::FfiLakeSnapshot { + let bucket_offsets: Vec = snapshot + .table_buckets_offset + .iter() + .map(|(bucket, offset)| ffi::FfiBucketOffset { + table_id: bucket.table_id(), + partition_id: bucket.partition_id().unwrap_or(-1), + bucket_id: bucket.bucket_id(), + offset: *offset, + }) + .collect(); + + ffi::FfiLakeSnapshot { + snapshot_id: snapshot.snapshot_id, + bucket_offsets, + } +} diff --git a/fluss-rust/crates/fluss/src/config.rs b/fluss-rust/crates/fluss/src/config.rs index 08574965f4..92f600e62a 100644 --- a/fluss-rust/crates/fluss/src/config.rs +++ b/fluss-rust/crates/fluss/src/config.rs @@ -18,7 +18,7 @@ use clap::Parser; use serde::{Deserialize, Serialize}; -#[derive(Parser, Debug, Clone, Deserialize, Serialize, Default)] +#[derive(Parser, Debug, Clone, Deserialize, Serialize)] #[command(author, version, about, long_about = None)] pub struct Config { #[arg(long)] @@ -37,3 +37,15 @@ pub struct Config { #[arg(long, default_value_t = 2 * 1024 * 1024)] pub writer_batch_size: i32, } + +impl Default for Config { + fn default() -> Self { + Self { + bootstrap_server: None, + request_max_size: 10 * 1024 * 1024, + writer_acks: String::from("all"), + writer_retries: i32::MAX, + writer_batch_size: 2 * 1024 * 1024, + } + } +} From 8e273a801b9f157102b64a0e1910a03b058673e1 Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Sat, 13 Dec 2025 15:17:44 +0800 Subject: [PATCH 030/287] chore: update readme to mark it as official rust client (#88) --- fluss-rust/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fluss-rust/README.md b/fluss-rust/README.md index 6caaebb154..ee9478c6a5 100644 --- a/fluss-rust/README.md +++ b/fluss-rust/README.md @@ -28,7 +28,7 @@ Rust implementation of [Apache Fluss™](https://fluss.apache.org/). 
It bridges the gap between streaming data and the data Lakehouse by enabling low-latency, high-throughput data ingestion and processing while seamlessly integrating with popular compute engines. ## Why Fluss Rust Client -It's an unofficial experimental Rust client for interacting with Fluss. This client provides foundational capabilities for table management and log streaming operations, enabling developers to explore Fluss within Rust ecosystems. +It's an official Rust client for interacting with Fluss. This client provides foundational capabilities for table management and log streaming operations, enabling developers to explore Fluss within Rust ecosystems. ## Quick-Start From bf42412bb92b89f690ae864dde04755fb6abe331 Mon Sep 17 00:00:00 2001 From: AlexZhao Date: Sat, 13 Dec 2025 17:10:03 +0800 Subject: [PATCH 031/287] chore: abort last not complete batch (#91) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: 赵海源 --- fluss-rust/crates/fluss/src/record/arrow.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index e46093dd14..806c9a5824 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -369,7 +369,11 @@ impl<'a> LogRecordsBatchs<'a> { let batch_size_bytes = LittleEndian::read_i32(self.data.get(self.current_pos + LENGTH_OFFSET..).unwrap()); - Some(batch_size_bytes as usize + LOG_OVERHEAD) + let batch_size = batch_size_bytes as usize + LOG_OVERHEAD; + if batch_size > self.remaining_bytes { + return None; + } + Some(batch_size) } } From d5586643db49eb07cbf35b0ef42f93c33ea7f6c1 Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Sat, 13 Dec 2025 17:11:28 +0800 Subject: [PATCH 032/287] feat: support subscribe from remote (#76) --- fluss-rust/bindings/cpp/src/lib.rs | 20 +- fluss-rust/bindings/cpp/src/types.rs | 15 +- 
fluss-rust/bindings/python/src/table.rs | 6 +- .../crates/examples/src/example_table.rs | 2 +- fluss-rust/crates/fluss/Cargo.toml | 17 +- .../crates/fluss/src/client/table/mod.rs | 1 + .../fluss/src/client/table/remote_log.rs | 267 ++++++++++++++++++ .../crates/fluss/src/client/table/scanner.rs | 90 +++++- fluss-rust/crates/fluss/src/error.rs | 12 + fluss-rust/crates/fluss/src/io/file_io.rs | 158 +++++++++++ fluss-rust/crates/fluss/src/io/mod.rs | 34 +++ fluss-rust/crates/fluss/src/io/storage.rs | 79 ++++++ fluss-rust/crates/fluss/src/io/storage_fs.rs | 30 ++ .../crates/fluss/src/io/storage_memory.rs | 24 ++ fluss-rust/crates/fluss/src/lib.rs | 1 + fluss-rust/crates/fluss/src/util/mod.rs | 8 + .../fluss/tests/integration/fluss_cluster.rs | 60 +++- .../crates/fluss/tests/integration/table.rs | 19 +- .../tests/integration/table_remote_scan.rs | 223 +++++++++++++++ fluss-rust/crates/fluss/tests/test_fluss.rs | 2 + 20 files changed, 1019 insertions(+), 49 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/client/table/remote_log.rs create mode 100644 fluss-rust/crates/fluss/src/io/file_io.rs create mode 100644 fluss-rust/crates/fluss/src/io/mod.rs create mode 100644 fluss-rust/crates/fluss/src/io/storage.rs create mode 100644 fluss-rust/crates/fluss/src/io/storage_fs.rs create mode 100644 fluss-rust/crates/fluss/src/io/storage_memory.rs create mode 100644 fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs diff --git a/fluss-rust/bindings/cpp/src/lib.rs b/fluss-rust/bindings/cpp/src/lib.rs index 3e883e29c6..54d6941379 100644 --- a/fluss-rust/bindings/cpp/src/lib.rs +++ b/fluss-rust/bindings/cpp/src/lib.rs @@ -243,7 +243,7 @@ fn new_connection(bootstrap_server: &str) -> Result<*mut Connection, String> { })); Ok(conn) } - Err(e) => Err(format!("Failed to connect: {}", e)), + Err(e) => Err(format!("Failed to connect: {e}")), } } @@ -264,7 +264,7 @@ impl Connection { let admin = Box::into_raw(Box::new(Admin { inner: admin })); Ok(admin) } - Err(e) => 
Err(format!("Failed to get admin: {}", e)), + Err(e) => Err(format!("Failed to get admin: {e}")), } } @@ -287,7 +287,7 @@ impl Connection { })); Ok(table) } - Err(e) => Err(format!("Failed to get table: {}", e)), + Err(e) => Err(format!("Failed to get table: {e}")), } } } @@ -398,7 +398,7 @@ impl Table { let table_append = match fluss_table.new_append() { Ok(a) => a, - Err(e) => return Err(format!("Failed to create append: {}", e)), + Err(e) => return Err(format!("Failed to create append: {e}")), }; let writer = table_append.create_writer(); @@ -413,7 +413,10 @@ impl Table { self.table_info.clone(), ); - let scanner = fluss_table.new_scan().create_log_scanner(); + let scanner = match fluss_table.new_scan().create_log_scanner() { + Ok(a) => a, + Err(e) => return Err(format!("Failed to create log scanner: {e}")), + }; let scanner = Box::into_raw(Box::new(LogScanner { inner: scanner })); Ok(scanner) } @@ -431,9 +434,12 @@ impl Table { let scan = fluss_table.new_scan(); let scan = match scan.project(&column_indices) { Ok(s) => s, - Err(e) => return Err(format!("Failed to project columns: {}", e)), + Err(e) => return Err(format!("Failed to project columns: {e}")), + }; + let scanner = match scan.create_log_scanner() { + Ok(a) => a, + Err(e) => return Err(format!("Failed to create log scanner: {e}")), }; - let scanner = scan.create_log_scanner(); let scanner = Box::into_raw(Box::new(LogScanner { inner: scanner })); Ok(scanner) } diff --git a/fluss-rust/bindings/cpp/src/types.rs b/fluss-rust/bindings/cpp/src/types.rs index d3bab38abe..f9404ac633 100644 --- a/fluss-rust/bindings/cpp/src/types.rs +++ b/fluss-rust/bindings/cpp/src/types.rs @@ -64,7 +64,7 @@ fn ffi_data_type_to_core(dt: i32) -> Result { DATA_TYPE_TIME => Ok(fcore::metadata::DataTypes::time()), DATA_TYPE_TIMESTAMP => Ok(fcore::metadata::DataTypes::timestamp()), DATA_TYPE_TIMESTAMP_LTZ => Ok(fcore::metadata::DataTypes::timestamp_ltz()), - _ => Err(anyhow!("Unknown data type: {}", dt)), + _ => 
Err(anyhow!("Unknown data type: {dt}")), } } @@ -423,10 +423,7 @@ fn core_row_to_ffi_fields(row: &fcore::row::ColumnarRow) -> Vec { datum.i32_val = array.value(row_id); datum } - _ => panic!( - "Will never come here. Unsupported Time32 unit for column {}", - i - ), + _ => panic!("Will never come here. Unsupported Time32 unit for column {i}"), }, ArrowDataType::Time64(unit) => match unit { TimeUnit::Microsecond => { @@ -449,14 +446,10 @@ fn core_row_to_ffi_fields(row: &fcore::row::ColumnarRow) -> Vec { datum.i64_val = array.value(row_id); datum } - _ => panic!( - "Will never come here. Unsupported Time64 unit for column {}", - i - ), + _ => panic!("Will never come here. Unsupported Time64 unit for column {i}"), }, other => panic!( - "Will never come here. Unsupported Arrow data type for column {}: {:?}", - i, other + "Will never come here. Unsupported Arrow data type for column {i}: {other:?}" ), }; diff --git a/fluss-rust/bindings/python/src/table.rs b/fluss-rust/bindings/python/src/table.rs index 2a8df25c9d..71759d7505 100644 --- a/fluss-rust/bindings/python/src/table.rs +++ b/fluss-rust/bindings/python/src/table.rs @@ -67,7 +67,11 @@ impl FlussTable { let table_scan = fluss_table.new_scan(); - let rust_scanner = table_scan.create_log_scanner(); + let rust_scanner = table_scan.create_log_scanner().map_err(|e| { + PyErr::new::(format!( + "Failed to create log scanner: {e:?}" + )) + })?; let admin = conn .get_admin() diff --git a/fluss-rust/crates/examples/src/example_table.rs b/fluss-rust/crates/examples/src/example_table.rs index deab3639da..2d6ac53d8f 100644 --- a/fluss-rust/crates/examples/src/example_table.rs +++ b/fluss-rust/crates/examples/src/example_table.rs @@ -70,7 +70,7 @@ pub async fn main() -> Result<()> { try_join!(f1, f2, append_writer.flush())?; // scan rows - let log_scanner = table.new_scan().create_log_scanner(); + let log_scanner = table.new_scan().create_log_scanner()?; log_scanner.subscribe(0, 0).await?; loop { diff --git 
a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index af770377ce..4547b9c327 100644 --- a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -22,6 +22,14 @@ version = { workspace = true } name = "fluss" build = "src/build.rs" +[features] +default = ["storage-memory", "storage-fs"] +storage-all = ["storage-memory", "storage-fs"] + +storage-memory = ["opendal/services-memory"] +storage-fs = ["opendal/services-fs"] +integration_tests = [] + [dependencies] arrow = { workspace = true } arrow-schema = "57.0.0" @@ -45,16 +53,17 @@ ordered-float = { version = "4", features = ["serde"] } parse-display = "0.10" ref-cast = "1.0" chrono = { workspace = true } -oneshot = "0.1.11" +opendal = "0.53.3" +url = "2.5.7" +async-trait = "0.1.89" +uuid = { version = "1.10", features = ["v4"] } +tempfile= "3.23.0" [dev-dependencies] testcontainers = "0.25.0" once_cell = "1.19" test-env-helpers = "0.2.2" -[features] -integration_tests = [] - [build-dependencies] prost-build = { version = "0.13.5" } diff --git a/fluss-rust/crates/fluss/src/client/table/mod.rs b/fluss-rust/crates/fluss/src/client/table/mod.rs index 52ae700fc6..99722477c5 100644 --- a/fluss-rust/crates/fluss/src/client/table/mod.rs +++ b/fluss-rust/crates/fluss/src/client/table/mod.rs @@ -26,6 +26,7 @@ pub const EARLIEST_OFFSET: i64 = -2; mod append; +mod remote_log; mod scanner; mod writer; diff --git a/fluss-rust/crates/fluss/src/client/table/remote_log.rs b/fluss-rust/crates/fluss/src/client/table/remote_log.rs new file mode 100644 index 0000000000..65805d069b --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/remote_log.rs @@ -0,0 +1,267 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::error::{Error, Result}; +use crate::io::{FileIO, Storage}; +use crate::metadata::TableBucket; +use crate::proto::{PbRemoteLogFetchInfo, PbRemoteLogSegment}; +use crate::record::{LogRecordsBatchs, ReadContext, ScanRecord}; +use crate::util::delete_file; +use std::collections::HashMap; +use std::io; +use std::path::{Path, PathBuf}; +use tempfile::TempDir; +use tokio::io::AsyncWriteExt; +use tokio::sync::oneshot; + +/// Represents a remote log segment that needs to be downloaded +#[derive(Debug, Clone)] +pub struct RemoteLogSegment { + pub segment_id: String, + pub start_offset: i64, + #[allow(dead_code)] + pub end_offset: i64, + #[allow(dead_code)] + pub size_in_bytes: i32, + pub table_bucket: TableBucket, +} + +impl RemoteLogSegment { + pub fn from_proto(segment: &PbRemoteLogSegment, table_bucket: TableBucket) -> Self { + Self { + segment_id: segment.remote_log_segment_id.clone(), + start_offset: segment.remote_log_start_offset, + end_offset: segment.remote_log_end_offset, + size_in_bytes: segment.segment_size_in_bytes, + table_bucket, + } + } + + /// Get the local file name for this remote log segment + pub fn local_file_name(&self) -> String { + // Format: ${remote_segment_id}_${offset_prefix}.log + let offset_prefix = format!("{:020}", self.start_offset); + format!("{}_{}.log", self.segment_id, offset_prefix) + } +} + +/// Represents remote log fetch information 
+#[derive(Debug, Clone)] +pub struct RemoteLogFetchInfo { + pub remote_log_tablet_dir: String, + #[allow(dead_code)] + pub partition_name: Option, + pub remote_log_segments: Vec, + pub first_start_pos: i32, +} + +impl RemoteLogFetchInfo { + pub fn from_proto(info: &PbRemoteLogFetchInfo, table_bucket: TableBucket) -> Result { + let segments = info + .remote_log_segments + .iter() + .map(|s| RemoteLogSegment::from_proto(s, table_bucket.clone())) + .collect(); + + Ok(Self { + remote_log_tablet_dir: info.remote_log_tablet_dir.clone(), + partition_name: info.partition_name.clone(), + remote_log_segments: segments, + first_start_pos: info.first_start_pos.unwrap_or(0), + }) + } +} + +/// Future for a remote log download request +pub struct RemoteLogDownloadFuture { + receiver: Option>>, +} + +impl RemoteLogDownloadFuture { + pub fn new(receiver: oneshot::Receiver>) -> Self { + Self { + receiver: Some(receiver), + } + } + + /// Get the downloaded file path + pub async fn get_file_path(&mut self) -> Result { + let receiver = self + .receiver + .take() + .ok_or_else(|| Error::Io(io::Error::other("Download future already consumed")))?; + + receiver.await.map_err(|e| { + Error::Io(io::Error::other(format!( + "Download future cancelled: {e:?}" + ))) + })? + } +} + +/// Downloader for remote log segment files +pub struct RemoteLogDownloader { + local_log_dir: TempDir, +} + +impl RemoteLogDownloader { + pub fn new(local_log_dir: TempDir) -> Result { + Ok(Self { local_log_dir }) + } + + /// Request to fetch a remote log segment to local. This method is non-blocking. 
+ pub fn request_remote_log( + &self, + remote_log_tablet_dir: &str, + segment: &RemoteLogSegment, + ) -> Result { + let (sender, receiver) = oneshot::channel(); + let local_file_name = segment.local_file_name(); + let local_file_path = self.local_log_dir.path().join(&local_file_name); + let remote_path = self.build_remote_path(remote_log_tablet_dir, segment); + let remote_log_tablet_dir = remote_log_tablet_dir.to_string(); + // Spawn async download task + tokio::spawn(async move { + let result = + Self::download_file(&remote_log_tablet_dir, &remote_path, &local_file_path).await; + let _ = sender.send(result); + }); + Ok(RemoteLogDownloadFuture::new(receiver)) + } + + /// Build the remote path for a log segment + fn build_remote_path(&self, remote_log_tablet_dir: &str, segment: &RemoteLogSegment) -> String { + // Format: ${remote_log_tablet_dir}/${segment_id}/${offset_prefix}.log + let offset_prefix = format!("{:020}", segment.start_offset); + format!( + "{}/{}/{}.log", + remote_log_tablet_dir, segment.segment_id, offset_prefix + ) + } + + /// Download a file from remote storage to local using streaming read/write + async fn download_file( + remote_log_tablet_dir: &str, + remote_path: &str, + local_path: &Path, + ) -> Result { + // Handle both URL (e.g., "s3://bucket/path") and local file paths + // If the path doesn't contain "://", treat it as a local file path + let remote_log_tablet_dir_url = if remote_log_tablet_dir.contains("://") { + remote_log_tablet_dir.to_string() + } else { + format!("file://{remote_log_tablet_dir}") + }; + + // Create FileIO from the remote log tablet dir URL to get the storage + let file_io_builder = FileIO::from_url(&remote_log_tablet_dir_url)?; + + // Build storage and create operator directly + let storage = Storage::build(file_io_builder)?; + let (op, relative_path) = storage.create(remote_path)?; + + // Get file metadata to know the size + let meta = op.stat(relative_path).await?; + let file_size = meta.content_length(); + + // 
Create local file for writing + let mut local_file = tokio::fs::File::create(local_path).await?; + + // Stream data from remote to local file in chunks + // opendal::Reader::read accepts a range, so we read in chunks + const CHUNK_SIZE: u64 = 8 * 1024 * 1024; // 8MB chunks for efficient reading + let mut offset = 0u64; + + while offset < file_size { + let end = std::cmp::min(offset + CHUNK_SIZE, file_size); + let range = offset..end; + + // Read chunk from remote storage + let chunk = op.read_with(relative_path).range(range.clone()).await?; + let bytes = chunk.to_bytes(); + + // Write chunk to local file + local_file.write_all(&bytes).await?; + + offset = end; + } + + // Ensure all data is flushed to disk + local_file.sync_all().await?; + + Ok(local_path.to_path_buf()) + } +} + +/// Pending fetch that waits for remote log file to be downloaded +pub struct RemotePendingFetch { + segment: RemoteLogSegment, + download_future: RemoteLogDownloadFuture, + pos_in_log_segment: i32, + #[allow(dead_code)] + fetch_offset: i64, + #[allow(dead_code)] + high_watermark: i64, + read_context: ReadContext, +} + +impl RemotePendingFetch { + pub fn new( + segment: RemoteLogSegment, + download_future: RemoteLogDownloadFuture, + pos_in_log_segment: i32, + fetch_offset: i64, + high_watermark: i64, + read_context: ReadContext, + ) -> Self { + Self { + segment, + download_future, + pos_in_log_segment, + fetch_offset, + high_watermark, + read_context, + } + } + + /// Convert to completed fetch by reading the downloaded file + pub async fn convert_to_completed_fetch( + mut self, + ) -> Result>> { + let file_path = self.download_future.get_file_path().await?; + let file_data = tokio::fs::read(&file_path).await?; + + // Slice the data if needed + let data = if self.pos_in_log_segment > 0 { + &file_data[self.pos_in_log_segment as usize..] 
+ } else { + &file_data + }; + + // delete the downloaded local file to free disk + delete_file(file_path).await; + + // Parse log records + let mut fetch_records = vec![]; + for log_record in &mut LogRecordsBatchs::new(data) { + fetch_records.extend(log_record.records(&self.read_context)?); + } + + let mut result = HashMap::new(); + result.insert(self.segment.table_bucket.clone(), fetch_records); + Ok(result) + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index 13372efecf..f6780d715c 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -29,6 +29,11 @@ use std::collections::HashMap; use std::slice::from_ref; use std::sync::Arc; use std::time::Duration; +use tempfile::TempDir; + +use crate::client::table::remote_log::{ + RemoteLogDownloader, RemoteLogFetchInfo, RemotePendingFetch, +}; const LOG_FETCH_MAX_BYTES: i32 = 16 * 1024 * 1024; #[allow(dead_code)] @@ -120,7 +125,7 @@ impl<'a> TableScan<'a> { Ok(self) } - pub fn create_log_scanner(self) -> LogScanner { + pub fn create_log_scanner(self) -> Result { LogScanner::new( &self.table_info, self.metadata.clone(), @@ -144,9 +149,9 @@ impl LogScanner { metadata: Arc, connections: Arc, projected_fields: Option>, - ) -> Self { + ) -> Result { let log_scanner_status = Arc::new(LogScannerStatus::new()); - Self { + Ok(Self { table_path: table_info.table_path.clone(), table_id: table_info.table_id, metadata: metadata.clone(), @@ -157,8 +162,8 @@ impl LogScanner { metadata.clone(), log_scanner_status.clone(), projected_fields, - ), - } + )?, + }) } pub async fn poll(&self, _timeout: Duration) -> Result { @@ -188,6 +193,7 @@ struct LogFetcher { metadata: Arc, log_scanner_status: Arc, read_context: ReadContext, + remote_log_downloader: RemoteLogDownloader, } impl LogFetcher { @@ -197,17 +203,21 @@ impl LogFetcher { metadata: Arc, log_scanner_status: Arc, projected_fields: Option>, 
- ) -> Self { + ) -> Result { let full_arrow_schema = to_arrow_schema(table_info.get_row_type()); - let read_context = Self::create_read_context(full_arrow_schema, projected_fields); - LogFetcher { + let read_context = Self::create_read_context(full_arrow_schema, projected_fields.clone()); + + let tmp_dir = TempDir::with_prefix("fluss-remote-logs")?; + + Ok(LogFetcher { table_path: table_info.table_path.clone(), conns, table_info, metadata, log_scanner_status, read_context, - } + remote_log_downloader: RemoteLogDownloader::new(tmp_dir)?, + }) } fn create_read_context( @@ -239,10 +249,66 @@ impl LogFetcher { let fetch_log_for_buckets = pb_fetch_log_resp.buckets_resp; for fetch_log_for_bucket in fetch_log_for_buckets { - let mut fetch_records = vec![]; let bucket: i32 = fetch_log_for_bucket.bucket_id; let table_bucket = TableBucket::new(table_id, bucket); - if fetch_log_for_bucket.records.is_some() { + + // Check if this is a remote log fetch + if let Some(ref remote_log_fetch_info) = + fetch_log_for_bucket.remote_log_fetch_info + { + let remote_fetch_info = RemoteLogFetchInfo::from_proto( + remote_log_fetch_info, + table_bucket.clone(), + )?; + + if let Some(fetch_offset) = + self.log_scanner_status.get_bucket_offset(&table_bucket) + { + let high_watermark = fetch_log_for_bucket.high_watermark.unwrap_or(-1); + // Download and process remote log segments + let mut pos_in_log_segment = remote_fetch_info.first_start_pos; + let mut current_fetch_offset = fetch_offset; + // todo: make segment download parallelly + for (i, segment) in + remote_fetch_info.remote_log_segments.iter().enumerate() + { + if i > 0 { + pos_in_log_segment = 0; + current_fetch_offset = segment.start_offset; + } + + let download_future = + self.remote_log_downloader.request_remote_log( + &remote_fetch_info.remote_log_tablet_dir, + segment, + )?; + let pending_fetch = RemotePendingFetch::new( + segment.clone(), + download_future, + pos_in_log_segment, + current_fetch_offset, + high_watermark, + 
self.read_context.clone(), + ); + let remote_records = + pending_fetch.convert_to_completed_fetch().await?; + // Update offset and merge results + for (tb, records) in remote_records { + if let Some(last_record) = records.last() { + self.log_scanner_status + .update_offset(&tb, last_record.offset() + 1); + } + result.entry(tb).or_default().extend(records); + } + } + } else { + // if the offset is null, it means the bucket has been unsubscribed, + // skip processing and continue to the next bucket. + continue; + } + } else if fetch_log_for_bucket.records.is_some() { + // Handle regular in-memory records + let mut fetch_records = vec![]; let data = fetch_log_for_bucket.records.unwrap(); for log_record in &mut LogRecordsBatchs::new(&data) { let last_offset = log_record.last_log_offset(); @@ -250,8 +316,8 @@ impl LogFetcher { self.log_scanner_status .update_offset(&table_bucket, last_offset + 1); } + result.insert(table_bucket, fetch_records); } - result.insert(table_bucket, fetch_records); } } } diff --git a/fluss-rust/crates/fluss/src/error.rs b/fluss-rust/crates/fluss/src/error.rs index 58b88a4786..b1d5d13b6e 100644 --- a/fluss-rust/crates/fluss/src/error.rs +++ b/fluss-rust/crates/fluss/src/error.rs @@ -47,4 +47,16 @@ pub enum Error { #[error("Illegal argument error: {0}")] IllegalArgument(String), + + #[error("IO not supported error: {0}")] + IoUnsupported(String), + + #[error("IO operation failed on underlying storage: {0}")] + IoUnexpected(Box), +} + +impl From for Error { + fn from(err: opendal::Error) -> Self { + Error::IoUnexpected(Box::new(err)) + } } diff --git a/fluss-rust/crates/fluss/src/io/file_io.rs b/fluss-rust/crates/fluss/src/io/file_io.rs new file mode 100644 index 0000000000..69a4c9707f --- /dev/null +++ b/fluss-rust/crates/fluss/src/io/file_io.rs @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use crate::error::*; +use std::collections::HashMap; +use std::ops::Range; +use std::sync::Arc; + +use bytes::Bytes; +use chrono::{DateTime, Utc}; +use opendal::Operator; + +use url::Url; + +use super::Storage; + +use crate::error::Result; + +#[derive(Clone, Debug)] +pub struct FileIO { + storage: Arc, +} + +impl FileIO { + /// Try to infer file io scheme from path. + pub fn from_url(path: &str) -> Result { + let url = + Url::parse(path).map_err(|_| Error::IllegalArgument(format!("Invalid URL: {path}")))?; + Ok(FileIOBuilder::new(url.scheme())) + } + + /// Create a new input file to read data. 
+ pub fn new_input(&self, path: &str) -> Result { + let (op, relative_path) = self.storage.create(path)?; + let path = path.to_string(); + let relative_path_pos = path.len() - relative_path.len(); + Ok(InputFile { + op, + path, + relative_path_pos, + }) + } +} + +#[derive(Debug)] +pub struct FileIOBuilder { + scheme_str: Option, + props: HashMap, +} + +impl FileIOBuilder { + pub fn new(scheme_str: impl ToString) -> Self { + Self { + scheme_str: Some(scheme_str.to_string()), + props: HashMap::default(), + } + } + + pub(crate) fn into_parts(self) -> (String, HashMap) { + (self.scheme_str.unwrap_or_default(), self.props) + } + + pub fn with_prop(mut self, key: impl ToString, value: impl ToString) -> Self { + self.props.insert(key.to_string(), value.to_string()); + self + } + + pub fn with_props( + mut self, + args: impl IntoIterator, + ) -> Self { + self.props + .extend(args.into_iter().map(|e| (e.0.to_string(), e.1.to_string()))); + self + } + + pub fn build(self) -> Result { + let storage = Storage::build(self)?; + Ok(FileIO { + storage: Arc::new(storage), + }) + } +} + +#[async_trait::async_trait] +pub trait FileRead: Send + Unpin + 'static { + async fn read(&self, range: Range) -> Result; +} + +#[async_trait::async_trait] +impl FileRead for opendal::Reader { + async fn read(&self, range: Range) -> Result { + Ok(opendal::Reader::read(self, range).await?.to_bytes()) + } +} + +#[derive(Debug)] +pub struct InputFile { + op: Operator, + path: String, + relative_path_pos: usize, +} + +impl InputFile { + pub fn location(&self) -> &str { + &self.path + } + + pub async fn exists(&self) -> Result { + Ok(self.op.exists(&self.path[self.relative_path_pos..]).await?) 
+ } + + pub async fn metadata(&self) -> Result { + let meta = self.op.stat(&self.path[self.relative_path_pos..]).await?; + + Ok(FileStatus { + size: meta.content_length(), + is_dir: meta.is_dir(), + path: self.path.clone(), + last_modified: meta.last_modified(), + }) + } + + pub async fn read(&self) -> Result { + Ok(self + .op + .read(&self.path[self.relative_path_pos..]) + .await? + .to_bytes()) + } + + pub async fn reader(&self) -> Result { + Ok(self.op.reader(&self.path[self.relative_path_pos..]).await?) + } +} + +#[derive(Clone, Debug)] +pub struct FileStatus { + pub size: u64, + pub is_dir: bool, + pub path: String, + pub last_modified: Option>, +} diff --git a/fluss-rust/crates/fluss/src/io/mod.rs b/fluss-rust/crates/fluss/src/io/mod.rs new file mode 100644 index 0000000000..3c9a1659e4 --- /dev/null +++ b/fluss-rust/crates/fluss/src/io/mod.rs @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +mod file_io; + +pub use file_io::*; + +mod storage; +pub use storage::*; + +#[cfg(feature = "storage-fs")] +mod storage_fs; +#[cfg(feature = "storage-fs")] +use storage_fs::*; +#[cfg(feature = "storage-memory")] +mod storage_memory; + +#[cfg(feature = "storage-memory")] +use storage_memory::*; diff --git a/fluss-rust/crates/fluss/src/io/storage.rs b/fluss-rust/crates/fluss/src/io/storage.rs new file mode 100644 index 0000000000..361da7ee80 --- /dev/null +++ b/fluss-rust/crates/fluss/src/io/storage.rs @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +use crate::error; +use crate::error::Result; +use crate::io::FileIOBuilder; +use opendal::{Operator, Scheme}; + +/// The storage carries all supported storage services in fluss +#[derive(Debug)] +pub enum Storage { + #[cfg(feature = "storage-memory")] + Memory, + #[cfg(feature = "storage-fs")] + LocalFs, +} + +impl Storage { + pub(crate) fn build(file_io_builder: FileIOBuilder) -> Result { + let (scheme_str, _) = file_io_builder.into_parts(); + let scheme = Self::parse_scheme(&scheme_str)?; + + match scheme { + #[cfg(feature = "storage-memory")] + Scheme::Memory => Ok(Self::Memory), + #[cfg(feature = "storage-fs")] + Scheme::Fs => Ok(Self::LocalFs), + _ => Err(error::Error::IoUnsupported( + "Unsupported storage feature".to_string(), + )), + } + } + + pub(crate) fn create<'a>(&self, path: &'a str) -> Result<(Operator, &'a str)> { + match self { + #[cfg(feature = "storage-memory")] + Storage::Memory => { + let op = super::memory_config_build()?; + + if let Some(stripped) = path.strip_prefix("memory:/") { + Ok((op, stripped)) + } else { + Ok((op, &path[1..])) + } + } + #[cfg(feature = "storage-fs")] + Storage::LocalFs => { + let op = super::fs_config_build()?; + if let Some(stripped) = path.strip_prefix("file:/") { + Ok((op, stripped)) + } else { + Ok((op, &path[1..])) + } + } + } + } + + fn parse_scheme(scheme: &str) -> Result { + match scheme { + "memory" => Ok(Scheme::Memory), + "file" | "" => Ok(Scheme::Fs), + s => Ok(s.parse::()?), + } + } +} diff --git a/fluss-rust/crates/fluss/src/io/storage_fs.rs b/fluss-rust/crates/fluss/src/io/storage_fs.rs new file mode 100644 index 0000000000..95ca6fa95f --- /dev/null +++ b/fluss-rust/crates/fluss/src/io/storage_fs.rs @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use opendal::Operator; +use opendal::services::FsConfig; + +use crate::error::Result; + +/// Build new opendal operator from give path. +pub(crate) fn fs_config_build() -> Result { + let mut cfg = FsConfig::default(); + cfg.root = Some("/".to_string()); + + Ok(Operator::from_config(cfg)?.finish()) +} diff --git a/fluss-rust/crates/fluss/src/io/storage_memory.rs b/fluss-rust/crates/fluss/src/io/storage_memory.rs new file mode 100644 index 0000000000..af73a90174 --- /dev/null +++ b/fluss-rust/crates/fluss/src/io/storage_memory.rs @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +use crate::error::Result; +use opendal::Operator; +use opendal::services::MemoryConfig; + +pub(crate) fn memory_config_build() -> Result { + Ok(Operator::from_config(MemoryConfig::default())?.finish()) +} diff --git a/fluss-rust/crates/fluss/src/lib.rs b/fluss-rust/crates/fluss/src/lib.rs index e63b5edfd8..366edfc60a 100644 --- a/fluss-rust/crates/fluss/src/lib.rs +++ b/fluss-rust/crates/fluss/src/lib.rs @@ -26,6 +26,7 @@ mod cluster; pub mod config; pub mod error; +pub mod io; mod util; pub type TableId = u64; diff --git a/fluss-rust/crates/fluss/src/util/mod.rs b/fluss-rust/crates/fluss/src/util/mod.rs index c26b4ec43a..f93abf967b 100644 --- a/fluss-rust/crates/fluss/src/util/mod.rs +++ b/fluss-rust/crates/fluss/src/util/mod.rs @@ -19,8 +19,10 @@ use crate::metadata::TableBucket; use linked_hash_map::LinkedHashMap; use std::collections::{HashMap, HashSet}; use std::hash::Hash; +use std::path::PathBuf; use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; +use tracing::warn; pub fn current_time_ms() -> i64 { SystemTime::now() @@ -29,6 +31,12 @@ pub fn current_time_ms() -> i64 { .as_millis() as i64 } +pub async fn delete_file(file_path: PathBuf) { + tokio::fs::remove_file(&file_path) + .await + .unwrap_or_else(|e| warn!("Could not delete file: {:?}, error: {:?}", &file_path, e)); +} + pub struct FairBucketStatusMap { map: LinkedHashMap>, size: usize, diff --git a/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs b/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs index e827e14932..21422df6a0 100644 --- a/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs +++ b/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs @@ -32,12 +32,27 @@ pub struct FlussTestingClusterBuilder { network: &'static str, cluster_conf: HashMap, testing_name: String, + remote_data_dir: Option, } impl FlussTestingClusterBuilder { pub fn new(testing_name: impl Into) -> Self { + Self::new_with_cluster_conf(testing_name.into(), &HashMap::default()) + } + 
+ pub fn with_remote_data_dir(mut self, dir: std::path::PathBuf) -> Self { + // Ensure the directory exists before mounting + std::fs::create_dir_all(&dir).expect("Failed to create remote data directory"); + self.remote_data_dir = Some(dir); + self + } + + pub fn new_with_cluster_conf( + testing_name: impl Into, + conf: &HashMap, + ) -> Self { // reduce testing resources - let mut cluster_conf = HashMap::new(); + let mut cluster_conf = conf.clone(); cluster_conf.insert( "netty.server.num-network-threads".to_string(), "1".to_string(), @@ -52,6 +67,7 @@ impl FlussTestingClusterBuilder { cluster_conf, network: "fluss-cluster-network", testing_name: testing_name.into(), + remote_data_dir: None, } } @@ -92,6 +108,7 @@ impl FlussTestingClusterBuilder { coordinator_server, tablet_servers, bootstrap_servers: "127.0.0.1:9123".to_string(), + remote_data_dir: self.remote_data_dir.clone(), } } @@ -147,7 +164,15 @@ impl FlussTestingClusterBuilder { tablet_server_confs.insert("internal.listener.name", "INTERNAL".to_string()); tablet_server_confs.insert("tablet-server.id", tablet_server_id); - GenericImage::new("fluss/fluss", FLUSS_VERSION) + // Set remote.data.dir to use the same path as host when volume mount is provided + // This ensures the path is consistent between host and container + if let Some(remote_data_dir) = &self.remote_data_dir { + tablet_server_confs.insert( + "remote.data.dir", + remote_data_dir.to_string_lossy().to_string(), + ); + } + let mut image = GenericImage::new("fluss/fluss", FLUSS_VERSION) .with_cmd(vec!["tabletServer"]) .with_mapped_port(expose_host_port as u16, ContainerPort::Tcp(9123)) .with_network(self.network) @@ -155,10 +180,20 @@ impl FlussTestingClusterBuilder { .with_env_var( "FLUSS_PROPERTIES", self.to_fluss_properties_with(tablet_server_confs), - ) - .start() - .await - .unwrap() + ); + + // Add volume mount if remote_data_dir is provided + if let Some(ref remote_data_dir) = self.remote_data_dir { + use testcontainers::core::Mount; + // 
Ensure directory exists before mounting (double check) + std::fs::create_dir_all(remote_data_dir) + .expect("Failed to create remote data directory for mount"); + let host_path = remote_data_dir.to_string_lossy().to_string(); + let container_path = remote_data_dir.to_string_lossy().to_string(); + image = image.with_mount(Mount::bind_mount(host_path, container_path)); + } + + image.start().await.unwrap() } fn to_fluss_properties_with(&self, extra_properties: HashMap<&str, String>) -> String { @@ -180,6 +215,7 @@ pub struct FlussTestingCluster { coordinator_server: Arc>, tablet_servers: HashMap>>, bootstrap_servers: String, + remote_data_dir: Option, } impl FlussTestingCluster { @@ -189,6 +225,18 @@ impl FlussTestingCluster { } self.coordinator_server.stop().await.unwrap(); self.zookeeper.stop().await.unwrap(); + if let Some(remote_data_dir) = &self.remote_data_dir { + // Try to clean up the remote data directory, but don't fail if it can't be deleted. + // This can happen in CI environments or if Docker containers are still using the directory. + // The directory will be cleaned up by the CI system or OS eventually. + if let Err(e) = tokio::fs::remove_dir_all(remote_data_dir).await { + eprintln!( + "Warning: Failed to delete remote data directory: {:?}, error: {:?}. \ + This is non-fatal and the directory may be cleaned up later.", + remote_data_dir, e + ); + } + } } pub async fn get_fluss_connection(&self) -> FlussConnection { diff --git a/fluss-rust/crates/fluss/tests/integration/table.rs b/fluss-rust/crates/fluss/tests/integration/table.rs index e14b852648..b23fd793cd 100644 --- a/fluss-rust/crates/fluss/tests/integration/table.rs +++ b/fluss-rust/crates/fluss/tests/integration/table.rs @@ -16,11 +16,11 @@ * limitations under the License. 
*/ -use crate::integration::fluss_cluster::FlussTestingCluster; use once_cell::sync::Lazy; use parking_lot::RwLock; use std::sync::Arc; +use crate::integration::fluss_cluster::FlussTestingCluster; #[cfg(test)] use test_env_helpers::*; @@ -39,12 +39,11 @@ mod table_test { use fluss::metadata::{DataTypes, Schema, TableBucket, TableDescriptor, TablePath}; use fluss::row::InternalRow; use std::sync::Arc; - use std::sync::atomic::AtomicUsize; use std::thread; fn before_all() { // Create a new tokio runtime in a separate thread let cluster_guard = SHARED_FLUSS_CLUSTER.clone(); - std::thread::spawn(move || { + thread::spawn(move || { let rt = tokio::runtime::Runtime::new().expect("Failed to create runtime"); rt.block_on(async { let cluster = FlussTestingClusterBuilder::new("test_table").build().await; @@ -71,7 +70,7 @@ mod table_test { fn after_all() { // Create a new tokio runtime in a separate thread let cluster_guard = SHARED_FLUSS_CLUSTER.clone(); - std::thread::spawn(move || { + thread::spawn(move || { let rt = tokio::runtime::Runtime::new().expect("Failed to create runtime"); rt.block_on(async { let mut guard = cluster_guard.write(); @@ -137,7 +136,10 @@ mod table_test { append_writer.flush().await.expect("Failed to flush"); let num_buckets = table.table_info().get_num_buckets(); - let log_scanner = table.new_scan().create_log_scanner(); + let log_scanner = table + .new_scan() + .create_log_scanner() + .expect("Failed to create log scanner"); for bucket_id in 0..num_buckets { log_scanner .subscribe(bucket_id, 0) @@ -166,7 +168,8 @@ mod table_test { .new_scan() .project(&[1, 0]) .expect("Failed to project") - .create_log_scanner(); + .create_log_scanner() + .expect("Failed to create log scanner"); for bucket_id in 0..num_buckets { log_scanner_projected .subscribe(bucket_id, 0) @@ -212,7 +215,9 @@ mod table_test { .expect("Failed to get table"); let table_scan = table.new_scan(); - let log_scanner = table_scan.create_log_scanner(); + let log_scanner = table_scan + 
.create_log_scanner() + .expect("Failed to create log scanner"); // Subscribe to bucket 0 starting from offset 0 log_scanner diff --git a/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs b/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs new file mode 100644 index 0000000000..f33d440f40 --- /dev/null +++ b/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +use crate::integration::fluss_cluster::FlussTestingCluster; +use once_cell::sync::Lazy; +use parking_lot::RwLock; +use std::sync::Arc; + +#[cfg(test)] +use test_env_helpers::*; + +// Module-level shared cluster instance (only for this test file) +static SHARED_FLUSS_CLUSTER: Lazy>>> = + Lazy::new(|| Arc::new(RwLock::new(None))); + +#[cfg(test)] +#[before_all] +#[after_all] +mod table_remote_scan_test { + use super::SHARED_FLUSS_CLUSTER; + use crate::integration::fluss_cluster::{FlussTestingCluster, FlussTestingClusterBuilder}; + use crate::integration::utils::create_table; + use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; + use fluss::row::{GenericRow, InternalRow}; + use std::collections::HashMap; + use std::sync::Arc; + use std::sync::atomic::AtomicUsize; + use std::sync::atomic::Ordering; + use std::thread; + use std::thread::sleep; + use std::time::Duration; + use uuid::Uuid; + + fn before_all() { + // Create a new tokio runtime in a separate thread + let cluster_guard = SHARED_FLUSS_CLUSTER.clone(); + thread::spawn(move || { + let rt = tokio::runtime::Runtime::new().expect("Failed to create runtime"); + rt.block_on(async { + // Create a temporary directory for remote data that can be accessed from both + // container and host. Use a fixed path so it's the same in container and host. + // On macOS, Docker Desktop may have issues with /tmp, so we use a path in the + // current working directory or user's home directory which Docker can access. 
+ let temp_dir = std::env::current_dir() + .unwrap_or_else(|_| std::path::PathBuf::from(".")) + .join("target") + .join(format!("test-remote-data-{}", Uuid::new_v4())); + + // Remove existing directory if it exists to start fresh + let _ = std::fs::remove_dir_all(&temp_dir); + std::fs::create_dir_all(&temp_dir) + .expect("Failed to create temporary directory for remote data"); + println!("temp_dir: {:?}", temp_dir); + + // Verify directory was created and is accessible + if !temp_dir.exists() { + panic!("Remote data directory was not created: {:?}", temp_dir); + } + + // Get absolute path for Docker mount + let temp_dir = temp_dir + .canonicalize() + .expect("Failed to canonicalize remote data directory path"); + + let mut cluster_conf = HashMap::new(); + // set to a small size to make data can be tiered to remote + cluster_conf.insert("log.segment.file-size".to_string(), "120b".to_string()); + cluster_conf.insert( + "remote.log.task-interval-duration".to_string(), + "1s".to_string(), + ); + // remote.data.dir uses the same path in container and host + cluster_conf.insert( + "remote.data.dir".to_string(), + temp_dir.to_string_lossy().to_string(), + ); + + let cluster = + FlussTestingClusterBuilder::new_with_cluster_conf("test_table", &cluster_conf) + .with_remote_data_dir(temp_dir) + .build() + .await; + let mut guard = cluster_guard.write(); + *guard = Some(cluster); + }); + }) + .join() + .expect("Failed to create cluster"); + + // wait for 20 seconds to avoid the error like + // CoordinatorEventProcessor is not initialized yet + sleep(Duration::from_secs(20)); + } + + fn after_all() { + // Create a new tokio runtime in a separate thread + let cluster_guard = SHARED_FLUSS_CLUSTER.clone(); + thread::spawn(move || { + let rt = tokio::runtime::Runtime::new().expect("Failed to create runtime"); + rt.block_on(async { + let mut guard = cluster_guard.write(); + if let Some(cluster) = guard.take() { + cluster.stop().await; + } + }); + }) + .join() + .expect("Failed to 
cleanup cluster"); + } + + #[tokio::test] + async fn test_scan_remote_log() { + let cluster = get_fluss_cluster(); + let connection = cluster.get_fluss_connection().await; + + let admin = connection.get_admin().await.expect("Failed to get admin"); + + let table_path = TablePath::new( + "fluss".to_string(), + "test_append_record_batch_and_scan".to_string(), + ); + + let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("c1", DataTypes::int()) + .column("c2", DataTypes::string()) + .build() + .expect("Failed to build schema"), + ) + .property("table.log.arrow.compression.type", "NONE") + .build() + .expect("Failed to build table"); + + create_table(&admin, &table_path, &table_descriptor).await; + + let table = connection + .get_table(&table_path) + .await + .expect("Failed to get table"); + + let append_writer = table + .new_append() + .expect("Failed to create append") + .create_writer(); + + // append 20 rows, there must be some tiered to remote + let record_count = 20; + for i in 0..record_count { + let mut row = GenericRow::new(); + row.set_field(0, i as i32); + let v = format!("v{}", i); + row.set_field(1, v.as_str()); + append_writer + .append(row) + .await + .expect("Failed to append row"); + } + + // Create a log scanner and subscribe to all buckets to read appended records + let num_buckets = table.table_info().get_num_buckets(); + let log_scanner = table + .new_scan() + .create_log_scanner() + .expect("Failed to create log scanner"); + for bucket_id in 0..num_buckets { + log_scanner + .subscribe(bucket_id, 0) + .await + .expect("Failed to subscribe"); + } + + let mut records = Vec::with_capacity(record_count); + let start = std::time::Instant::now(); + const MAX_WAIT_DURATION: Duration = Duration::from_secs(30); + while records.len() < record_count { + if start.elapsed() > MAX_WAIT_DURATION { + panic!( + "Timed out waiting for {} records; only got {} after {:?}", + record_count, + records.len(), + start.elapsed() + ); + 
} + let scan_records = log_scanner + .poll(Duration::from_secs(1)) + .await + .expect("Failed to poll log scanner"); + records.extend(scan_records); + } + + // then, check the data + for (i, record) in records.iter().enumerate() { + let row = record.row(); + let expected_c1 = i as i32; + let expected_c2 = format!("v{}", i); + assert_eq!(row.get_int(0), expected_c1, "c1 mismatch at index {}", i); + assert_eq!(row.get_string(1), expected_c2, "c2 mismatch at index {}", i); + } + } + + fn get_fluss_cluster() -> Arc { + let cluster_guard = SHARED_FLUSS_CLUSTER.read(); + if cluster_guard.is_none() { + panic!("Fluss cluster not initialized. Make sure before_all() was called."); + } + Arc::new(cluster_guard.as_ref().unwrap().clone()) + } +} diff --git a/fluss-rust/crates/fluss/tests/test_fluss.rs b/fluss-rust/crates/fluss/tests/test_fluss.rs index a15ca2395a..65111af218 100644 --- a/fluss-rust/crates/fluss/tests/test_fluss.rs +++ b/fluss-rust/crates/fluss/tests/test_fluss.rs @@ -25,4 +25,6 @@ mod integration { mod table; mod utils; + + mod table_remote_scan; } From 5b31e28a9f3c68393db11ed51f32a6e4752fe2b2 Mon Sep 17 00:00:00 2001 From: tison Date: Mon, 15 Dec 2025 10:41:45 +0800 Subject: [PATCH 033/287] chore: upgrade opendal and adopt jiff (#95) Signed-off-by: tison --- fluss-rust/Cargo.toml | 4 +-- fluss-rust/bindings/python/Cargo.toml | 3 +-- fluss-rust/bindings/python/src/lib.rs | 5 ++-- fluss-rust/crates/fluss/Cargo.toml | 13 +++++----- fluss-rust/crates/fluss/{src => }/build.rs | 0 fluss-rust/crates/fluss/src/io/file_io.rs | 6 ++--- fluss-rust/crates/fluss/src/row/datum.rs | 25 +++++++++---------- .../crates/fluss/tests/integration/admin.rs | 7 +++--- .../crates/fluss/tests/integration/table.rs | 6 ++--- .../tests/integration/table_remote_scan.rs | 6 ++--- 10 files changed, 37 insertions(+), 38 deletions(-) rename fluss-rust/crates/fluss/{src => }/build.rs (100%) diff --git a/fluss-rust/Cargo.toml b/fluss-rust/Cargo.toml index b4ac03b7be..284a836349 100644 --- 
a/fluss-rust/Cargo.toml +++ b/fluss-rust/Cargo.toml @@ -19,13 +19,11 @@ categories = ["command-line-utilities"] description = "The rust implementation of fluss" repository = "https://github.com/apache/fluss-rust" -name = "fluss" edition = "2024" version = "0.1.0" license = "Apache-2.0" rust-version = "1.85" - [workspace] resolver = "2" members = ["crates/fluss", "crates/examples", "bindings/python", "bindings/cpp"] @@ -35,4 +33,4 @@ fluss = { version = "0.1.0", path = "./crates/fluss" } tokio = { version = "1.44.2", features = ["full"] } clap = { version = "4.5.37", features = ["derive"] } arrow = { version = "57.0.0", features = ["ipc_compression"] } -chrono = { version = "0.4", features = ["clock", "std", "wasmbind"] } +jiff = { version = "0.2" } diff --git a/fluss-rust/bindings/python/Cargo.toml b/fluss-rust/bindings/python/Cargo.toml index 9ecc6299cd..4da8bf835e 100644 --- a/fluss-rust/bindings/python/Cargo.toml +++ b/fluss-rust/bindings/python/Cargo.toml @@ -35,5 +35,4 @@ arrow-pyarrow = "57.0.0" arrow-schema = "57.0.0" arrow-array = "57.0.0" pyo3-async-runtimes = { version = "0.26.0", features = ["tokio-runtime"] } -chrono = { workspace = true } -once_cell = "1.21.3" +jiff = { workspace = true } diff --git a/fluss-rust/bindings/python/src/lib.rs b/fluss-rust/bindings/python/src/lib.rs index 63e84b1f86..49d51794ca 100644 --- a/fluss-rust/bindings/python/src/lib.rs +++ b/fluss-rust/bindings/python/src/lib.rs @@ -15,8 +15,9 @@ // specific language governing permissions and limitations // under the License. 
+use std::sync::LazyLock; + pub use ::fluss as fcore; -use once_cell::sync::Lazy; use pyo3::prelude::*; use tokio::runtime::Runtime; @@ -36,7 +37,7 @@ pub use metadata::*; pub use table::*; pub use utils::*; -static TOKIO_RUNTIME: Lazy = Lazy::new(|| { +static TOKIO_RUNTIME: LazyLock = LazyLock::new(|| { tokio::runtime::Builder::new_multi_thread() .enable_all() .build() diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index 4547b9c327..54235c4107 100644 --- a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -20,7 +20,6 @@ edition = { workspace = true } rust-version = { workspace = true } version = { workspace = true } name = "fluss" -build = "src/build.rs" [features] default = ["storage-memory", "storage-fs"] @@ -52,18 +51,20 @@ rust_decimal = "1" ordered-float = { version = "4", features = ["serde"] } parse-display = "0.10" ref-cast = "1.0" -chrono = { workspace = true } -opendal = "0.53.3" +jiff = { workspace = true } +opendal = "0.55.0" url = "2.5.7" async-trait = "0.1.89" uuid = { version = "1.10", features = ["v4"] } -tempfile= "3.23.0" +tempfile = "3.23.0" + +[target.'cfg(target_arch = "wasm32")'.dependencies] +jiff = { workspace = true, features = ["js"] } [dev-dependencies] testcontainers = "0.25.0" -once_cell = "1.19" test-env-helpers = "0.2.2" [build-dependencies] -prost-build = { version = "0.13.5" } +prost-build = { version = "0.13.5" } diff --git a/fluss-rust/crates/fluss/src/build.rs b/fluss-rust/crates/fluss/build.rs similarity index 100% rename from fluss-rust/crates/fluss/src/build.rs rename to fluss-rust/crates/fluss/build.rs diff --git a/fluss-rust/crates/fluss/src/io/file_io.rs b/fluss-rust/crates/fluss/src/io/file_io.rs index 69a4c9707f..96be06f019 100644 --- a/fluss-rust/crates/fluss/src/io/file_io.rs +++ b/fluss-rust/crates/fluss/src/io/file_io.rs @@ -22,7 +22,7 @@ use std::ops::Range; use std::sync::Arc; use bytes::Bytes; -use chrono::{DateTime, Utc}; +use jiff::Timestamp; use 
opendal::Operator; use url::Url; @@ -132,7 +132,7 @@ impl InputFile { size: meta.content_length(), is_dir: meta.is_dir(), path: self.path.clone(), - last_modified: meta.last_modified(), + last_modified: meta.last_modified().map(Into::into), }) } @@ -154,5 +154,5 @@ pub struct FileStatus { pub size: u64, pub is_dir: bool, pub path: String, - pub last_modified: Option>, + pub last_modified: Option, } diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs index 3e487039b3..6929b57c5c 100644 --- a/fluss-rust/crates/fluss/src/row/datum.rs +++ b/fluss-rust/crates/fluss/src/row/datum.rs @@ -15,15 +15,13 @@ // specific language governing permissions and limitations // under the License. -use chrono::Datelike; - use crate::error::Error::RowConvertError; use crate::error::Result; use arrow::array::{ ArrayBuilder, BinaryBuilder, BooleanBuilder, Float32Builder, Float64Builder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, StringBuilder, }; -use chrono::NaiveDate; +use jiff::ToSpan; use ordered_float::OrderedFloat; use parse_display::Display; use ref_cast::RefCast; @@ -35,8 +33,6 @@ use std::ops::Deref; #[allow(dead_code)] const THIRTY_YEARS_MICROSECONDS: i64 = 946_684_800_000_000; -pub const UNIX_EPOCH_DAYS: i32 = 719_163; - #[derive(Debug, Clone, Display, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] pub enum Datum<'a> { #[display("null")] @@ -404,6 +400,8 @@ impl From> for Blob { } } +const UNIX_EPOCH_DAY: jiff::civil::Date = jiff::civil::date(1970, 1, 1); + impl Date { pub const fn new(inner: i32) -> Self { Date(inner) @@ -414,16 +412,17 @@ impl Date { self.0 } - pub fn year(&self) -> i32 { - let date = NaiveDate::from_num_days_from_ce_opt(self.0 + UNIX_EPOCH_DAYS).unwrap(); + pub fn year(&self) -> i16 { + let date = UNIX_EPOCH_DAY + self.0.days(); date.year() } - pub fn month(&self) -> i32 { - let date = NaiveDate::from_num_days_from_ce_opt(self.0 + UNIX_EPOCH_DAYS).unwrap(); - date.month() as i32 + pub fn month(&self) 
-> i8 { + let date = UNIX_EPOCH_DAY + self.0.days(); + date.month() } - pub fn day(&self) -> i32 { - let date = NaiveDate::from_num_days_from_ce_opt(self.0 + UNIX_EPOCH_DAYS).unwrap(); - date.day() as i32 + + pub fn day(&self) -> i8 { + let date = UNIX_EPOCH_DAY + self.0.days(); + date.day() } } diff --git a/fluss-rust/crates/fluss/tests/integration/admin.rs b/fluss-rust/crates/fluss/tests/integration/admin.rs index c51373d2cb..0086d9c03e 100644 --- a/fluss-rust/crates/fluss/tests/integration/admin.rs +++ b/fluss-rust/crates/fluss/tests/integration/admin.rs @@ -16,16 +16,17 @@ // under the License. use crate::integration::fluss_cluster::FlussTestingCluster; -use once_cell::sync::Lazy; use parking_lot::RwLock; + use std::sync::Arc; +use std::sync::LazyLock; #[cfg(test)] use test_env_helpers::*; // Module-level shared cluster instance (only for this test file) -static SHARED_FLUSS_CLUSTER: Lazy>>> = - Lazy::new(|| Arc::new(RwLock::new(None))); +static SHARED_FLUSS_CLUSTER: LazyLock>>> = + LazyLock::new(|| Arc::new(RwLock::new(None))); #[cfg(test)] #[before_all] diff --git a/fluss-rust/crates/fluss/tests/integration/table.rs b/fluss-rust/crates/fluss/tests/integration/table.rs index b23fd793cd..a058bfe0bc 100644 --- a/fluss-rust/crates/fluss/tests/integration/table.rs +++ b/fluss-rust/crates/fluss/tests/integration/table.rs @@ -16,17 +16,17 @@ * limitations under the License. 
*/ -use once_cell::sync::Lazy; use parking_lot::RwLock; use std::sync::Arc; +use std::sync::LazyLock; use crate::integration::fluss_cluster::FlussTestingCluster; #[cfg(test)] use test_env_helpers::*; // Module-level shared cluster instance (only for this test file) -static SHARED_FLUSS_CLUSTER: Lazy>>> = - Lazy::new(|| Arc::new(RwLock::new(None))); +static SHARED_FLUSS_CLUSTER: LazyLock>>> = + LazyLock::new(|| Arc::new(RwLock::new(None))); #[cfg(test)] #[before_all] diff --git a/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs b/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs index f33d440f40..f52d526e37 100644 --- a/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs +++ b/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs @@ -16,16 +16,16 @@ * limitations under the License. */ use crate::integration::fluss_cluster::FlussTestingCluster; -use once_cell::sync::Lazy; use parking_lot::RwLock; use std::sync::Arc; +use std::sync::LazyLock; #[cfg(test)] use test_env_helpers::*; // Module-level shared cluster instance (only for this test file) -static SHARED_FLUSS_CLUSTER: Lazy>>> = - Lazy::new(|| Arc::new(RwLock::new(None))); +static SHARED_FLUSS_CLUSTER: LazyLock>>> = + LazyLock::new(|| Arc::new(RwLock::new(None))); #[cfg(test)] #[before_all] From d634a7083c5f10efe718cd00151f7ed472164b0a Mon Sep 17 00:00:00 2001 From: tison Date: Mon, 15 Dec 2025 14:31:17 +0800 Subject: [PATCH 034/287] chore: use log over tracing (#96) Signed-off-by: tison --- fluss-rust/crates/fluss/Cargo.toml | 11 +++++------ fluss-rust/crates/fluss/src/client/write/broadcast.rs | 3 +-- fluss-rust/crates/fluss/src/io/file_io.rs | 4 +--- fluss-rust/crates/fluss/src/rpc/server_connection.rs | 11 ++++++----- fluss-rust/crates/fluss/src/util/mod.rs | 3 +-- 5 files changed, 14 insertions(+), 18 deletions(-) diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index 54235c4107..aa763d584e 100644 --- 
a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -37,24 +37,23 @@ futures = "0.3" clap = { workspace = true } crc32c = "0.6.8" linked-hash-map = "0.5.6" -prost = "0.13.5" +prost = "0.14" rand = "0.9.1" serde = { version = "1.0.219", features = ["derive", "rc"] } serde_json = "1.0.140" -thiserror = "1.0" -tracing = "0.1" +thiserror = "2" +log = { version = "0.4", features = ["kv_std"] } tokio = { workspace = true } parking_lot = "0.12" bytes = "1.10.1" dashmap = "6.1.0" rust_decimal = "1" -ordered-float = { version = "4", features = ["serde"] } +ordered-float = { version = "5", features = ["serde"] } parse-display = "0.10" ref-cast = "1.0" jiff = { workspace = true } opendal = "0.55.0" url = "2.5.7" -async-trait = "0.1.89" uuid = { version = "1.10", features = ["v4"] } tempfile = "3.23.0" @@ -67,4 +66,4 @@ test-env-helpers = "0.2.2" [build-dependencies] -prost-build = { version = "0.13.5" } +prost-build = { version = "0.14" } diff --git a/fluss-rust/crates/fluss/src/client/write/broadcast.rs b/fluss-rust/crates/fluss/src/client/write/broadcast.rs index 2dcc34cbc4..d2e7f0c91c 100644 --- a/fluss-rust/crates/fluss/src/client/write/broadcast.rs +++ b/fluss-rust/crates/fluss/src/client/write/broadcast.rs @@ -19,7 +19,6 @@ use parking_lot::RwLock; use std::sync::Arc; use thiserror::Error; use tokio::sync::Notify; -use tracing::warn; pub type Result = std::result::Result; @@ -111,7 +110,7 @@ where fn drop(&mut self) { let mut data = self.shared.data.write(); if data.is_none() { - warn!("BroadcastOnce dropped without producing"); + log::warn!("BroadcastOnce dropped without producing"); *data = Some(Err(Error::Dropped)); self.shared.notify.notify_waiters(); } diff --git a/fluss-rust/crates/fluss/src/io/file_io.rs b/fluss-rust/crates/fluss/src/io/file_io.rs index 96be06f019..ec3b87ec5b 100644 --- a/fluss-rust/crates/fluss/src/io/file_io.rs +++ b/fluss-rust/crates/fluss/src/io/file_io.rs @@ -97,12 +97,10 @@ impl FileIOBuilder { } } 
-#[async_trait::async_trait] pub trait FileRead: Send + Unpin + 'static { - async fn read(&self, range: Range) -> Result; + fn read(&self, range: Range) -> impl Future> + Send; } -#[async_trait::async_trait] impl FileRead for opendal::Reader { async fn read(&self, range: Range) -> Result { Ok(opendal::Reader::read(self, range).await?.to_bytes()) diff --git a/fluss-rust/crates/fluss/src/rpc/server_connection.rs b/fluss-rust/crates/fluss/src/rpc/server_connection.rs index 4eeda46063..c474534b61 100644 --- a/fluss-rust/crates/fluss/src/rpc/server_connection.rs +++ b/fluss-rust/crates/fluss/src/rpc/server_connection.rs @@ -37,7 +37,6 @@ use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufStream, WriteHalf}; use tokio::sync::Mutex as AsyncMutex; use tokio::sync::oneshot::{Sender, channel}; use tokio::task::JoinHandle; -use tracing::warn; pub type MessengerTransport = ServerConnectionInner>; @@ -178,8 +177,10 @@ where let header = match ResponseHeader::read_versioned(&mut cursor, ApiVersion(0)) { Ok(header) => header, - Err(e) => { - warn!(%e, "Cannot read message header, ignoring message"); + Err(err) => { + log::warn!( + "Cannot read message header, ignoring message: {err:?}" + ); continue; } }; @@ -189,8 +190,8 @@ where match map.remove(&header.request_id) { Some(active_request) => active_request, _ => { - warn!( - request_id = header.request_id, + log::warn!( + request_id:% = header.request_id; "Got response for unknown request", ); continue; diff --git a/fluss-rust/crates/fluss/src/util/mod.rs b/fluss-rust/crates/fluss/src/util/mod.rs index f93abf967b..d8c0db59d8 100644 --- a/fluss-rust/crates/fluss/src/util/mod.rs +++ b/fluss-rust/crates/fluss/src/util/mod.rs @@ -22,7 +22,6 @@ use std::hash::Hash; use std::path::PathBuf; use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; -use tracing::warn; pub fn current_time_ms() -> i64 { SystemTime::now() @@ -34,7 +33,7 @@ pub fn current_time_ms() -> i64 { pub async fn delete_file(file_path: PathBuf) { 
tokio::fs::remove_file(&file_path) .await - .unwrap_or_else(|e| warn!("Could not delete file: {:?}, error: {:?}", &file_path, e)); + .unwrap_or_else(|err| log::warn!("Could not delete file: {file_path:?}, error: {err:?}")); } pub struct FairBucketStatusMap { From 746f948837878b2c42b8b5f4394c29f4ac56fcb2 Mon Sep 17 00:00:00 2001 From: AlexZhao Date: Sat, 20 Dec 2025 11:32:10 +0800 Subject: [PATCH 035/287] feat: support s3 as remote segment (#93) --------- Co-authored-by: luoyuxia --- fluss-rust/Cargo.toml | 5 + fluss-rust/crates/examples/Cargo.toml | 2 +- fluss-rust/crates/fluss/Cargo.toml | 11 +- .../crates/fluss/src/client/credentials.rs | 165 ++++++++++++++++++ fluss-rust/crates/fluss/src/client/mod.rs | 2 + .../fluss/src/client/table/remote_log.rs | 69 +++++++- .../crates/fluss/src/client/table/scanner.rs | 9 + fluss-rust/crates/fluss/src/io/mod.rs | 7 +- fluss-rust/crates/fluss/src/io/storage.rs | 16 +- fluss-rust/crates/fluss/src/io/storage_s3.rs | 48 +++++ .../crates/fluss/src/proto/fluss_api.proto | 15 ++ fluss-rust/crates/fluss/src/record/arrow.rs | 67 ++++++- fluss-rust/crates/fluss/src/rpc/api_key.rs | 3 + .../src/rpc/message/get_security_token.rs | 53 ++++++ .../crates/fluss/src/rpc/message/mod.rs | 2 + 15 files changed, 453 insertions(+), 21 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/client/credentials.rs create mode 100644 fluss-rust/crates/fluss/src/io/storage_s3.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/get_security_token.rs diff --git a/fluss-rust/Cargo.toml b/fluss-rust/Cargo.toml index 284a836349..4155ea84b9 100644 --- a/fluss-rust/Cargo.toml +++ b/fluss-rust/Cargo.toml @@ -33,4 +33,9 @@ fluss = { version = "0.1.0", path = "./crates/fluss" } tokio = { version = "1.44.2", features = ["full"] } clap = { version = "4.5.37", features = ["derive"] } arrow = { version = "57.0.0", features = ["ipc_compression"] } +chrono = { version = "0.4", features = ["clock", "std", "wasmbind"] } + +serde = { version = "1.0", 
features = ["derive"] } +serde_json = "1.0" +opendal = "0.53" jiff = { version = "0.2" } diff --git a/fluss-rust/crates/examples/Cargo.toml b/fluss-rust/crates/examples/Cargo.toml index dab85b66ed..e1fa531764 100644 --- a/fluss-rust/crates/examples/Cargo.toml +++ b/fluss-rust/crates/examples/Cargo.toml @@ -26,7 +26,7 @@ version = { workspace = true } [dependencies] fluss = { workspace = true } tokio = { workspace = true } -clap = { workspace = true} +clap = { workspace = true } [[example]] name = "example-table" path = "src/example_table.rs" \ No newline at end of file diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index aa763d584e..0cf0364773 100644 --- a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -22,11 +22,12 @@ version = { workspace = true } name = "fluss" [features] -default = ["storage-memory", "storage-fs"] -storage-all = ["storage-memory", "storage-fs"] +default = ["storage-memory", "storage-fs", "storage-s3"] +storage-all = ["storage-memory", "storage-fs", "storage-s3"] storage-memory = ["opendal/services-memory"] storage-fs = ["opendal/services-fs"] +storage-s3 = ["opendal/services-s3"] integration_tests = [] [dependencies] @@ -39,9 +40,9 @@ crc32c = "0.6.8" linked-hash-map = "0.5.6" prost = "0.14" rand = "0.9.1" -serde = { version = "1.0.219", features = ["derive", "rc"] } -serde_json = "1.0.140" -thiserror = "2" +serde = { workspace = true, features = ["rc"] } +serde_json = { workspace = true } +thiserror = "1.0" log = { version = "0.4", features = ["kv_std"] } tokio = { workspace = true } parking_lot = "0.12" diff --git a/fluss-rust/crates/fluss/src/client/credentials.rs b/fluss-rust/crates/fluss/src/client/credentials.rs new file mode 100644 index 0000000000..bd2a477a85 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/credentials.rs @@ -0,0 +1,165 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::metadata::Metadata; +use crate::error::{Error, Result}; +use crate::rpc::RpcClient; +use crate::rpc::message::GetSecurityTokenRequest; +use parking_lot::RwLock; +use serde::Deserialize; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +const CACHE_TTL: Duration = Duration::from_secs(3600); + +#[derive(Debug, Deserialize)] +struct Credentials { + access_key_id: String, + access_key_secret: String, + security_token: Option, +} + +struct CachedToken { + access_key_id: String, + secret_access_key: String, + security_token: Option, + addition_infos: HashMap, + cached_at: Instant, +} + +impl CachedToken { + fn to_remote_fs_props(&self) -> HashMap { + let mut props = HashMap::new(); + + props.insert("access_key_id".to_string(), self.access_key_id.clone()); + props.insert( + "secret_access_key".to_string(), + self.secret_access_key.clone(), + ); + + if let Some(token) = &self.security_token { + props.insert("security_token".to_string(), token.clone()); + } + + for (key, value) in &self.addition_infos { + if let Some((opendal_key, transform)) = convert_hadoop_key_to_opendal(key) { + let final_value = if transform { + // Invert boolean value (path_style_access -> enable_virtual_host_style) + if value == "true" { + 
"false".to_string() + } else { + "true".to_string() + } + } else { + value.clone() + }; + props.insert(opendal_key, final_value); + } + } + + props + } +} + +/// Returns (opendal_key, needs_inversion) +/// needs_inversion is true for path_style_access -> enable_virtual_host_style conversion +fn convert_hadoop_key_to_opendal(hadoop_key: &str) -> Option<(String, bool)> { + match hadoop_key { + "fs.s3a.endpoint" => Some(("endpoint".to_string(), false)), + "fs.s3a.endpoint.region" => Some(("region".to_string(), false)), + "fs.s3a.path.style.access" => Some(("enable_virtual_host_style".to_string(), true)), + "fs.s3a.connection.ssl.enabled" => None, + _ => None, + } +} + +pub struct CredentialsCache { + inner: RwLock>, +} + +impl CredentialsCache { + pub fn new() -> Self { + Self { + inner: RwLock::new(None), + } + } + + pub async fn get_or_refresh( + &self, + rpc_client: &Arc, + metadata: &Arc, + ) -> Result> { + { + let guard = self.inner.read(); + if let Some(cached) = guard.as_ref() { + if cached.cached_at.elapsed() < CACHE_TTL { + return Ok(cached.to_remote_fs_props()); + } + } + } + + self.refresh_from_server(rpc_client, metadata).await + } + + async fn refresh_from_server( + &self, + rpc_client: &Arc, + metadata: &Arc, + ) -> Result> { + let cluster = metadata.get_cluster(); + let server_node = cluster.get_one_available_server(); + let conn = rpc_client.get_connection(server_node).await?; + + let request = GetSecurityTokenRequest::new(); + let response = conn.request(request).await?; + + // the token may be empty if the remote filesystem + // doesn't require token to access + if response.token.is_empty() { + return Ok(HashMap::new()); + } + + let credentials: Credentials = serde_json::from_slice(&response.token).map_err(|e| { + Error::JsonSerdeError(format!("Error when parse token from server: {e}")) + })?; + + let mut addition_infos = HashMap::new(); + for kv in &response.addition_info { + addition_infos.insert(kv.key.clone(), kv.value.clone()); + } + + let 
cached = CachedToken { + access_key_id: credentials.access_key_id, + secret_access_key: credentials.access_key_secret, + security_token: credentials.security_token, + addition_infos, + cached_at: Instant::now(), + }; + + let props = cached.to_remote_fs_props(); + *self.inner.write() = Some(cached); + + Ok(props) + } +} + +impl Default for CredentialsCache { + fn default() -> Self { + Self::new() + } +} diff --git a/fluss-rust/crates/fluss/src/client/mod.rs b/fluss-rust/crates/fluss/src/client/mod.rs index a971439258..cff218b345 100644 --- a/fluss-rust/crates/fluss/src/client/mod.rs +++ b/fluss-rust/crates/fluss/src/client/mod.rs @@ -17,12 +17,14 @@ mod admin; mod connection; +mod credentials; mod metadata; mod table; mod write; pub use admin::*; pub use connection::*; +pub use credentials::*; pub use metadata::*; pub use table::*; pub use write::*; diff --git a/fluss-rust/crates/fluss/src/client/table/remote_log.rs b/fluss-rust/crates/fluss/src/client/table/remote_log.rs index 65805d069b..a2561f3925 100644 --- a/fluss-rust/crates/fluss/src/client/table/remote_log.rs +++ b/fluss-rust/crates/fluss/src/client/table/remote_log.rs @@ -20,6 +20,7 @@ use crate::metadata::TableBucket; use crate::proto::{PbRemoteLogFetchInfo, PbRemoteLogSegment}; use crate::record::{LogRecordsBatchs, ReadContext, ScanRecord}; use crate::util::delete_file; +use parking_lot::RwLock; use std::collections::HashMap; use std::io; use std::path::{Path, PathBuf}; @@ -115,11 +116,19 @@ impl RemoteLogDownloadFuture { /// Downloader for remote log segment files pub struct RemoteLogDownloader { local_log_dir: TempDir, + remote_fs_props: RwLock>, } impl RemoteLogDownloader { pub fn new(local_log_dir: TempDir) -> Result { - Ok(Self { local_log_dir }) + Ok(Self { + local_log_dir, + remote_fs_props: RwLock::new(HashMap::new()), + }) + } + + pub fn set_remote_fs_props(&self, props: HashMap) { + *self.remote_fs_props.write() = props; } /// Request to fetch a remote log segment to local. 
This method is non-blocking. @@ -133,10 +142,16 @@ impl RemoteLogDownloader { let local_file_path = self.local_log_dir.path().join(&local_file_name); let remote_path = self.build_remote_path(remote_log_tablet_dir, segment); let remote_log_tablet_dir = remote_log_tablet_dir.to_string(); + let remote_fs_props = self.remote_fs_props.read().clone(); // Spawn async download task tokio::spawn(async move { - let result = - Self::download_file(&remote_log_tablet_dir, &remote_path, &local_file_path).await; + let result = Self::download_file( + &remote_log_tablet_dir, + &remote_path, + &local_file_path, + &remote_fs_props, + ) + .await; let _ = sender.send(result); }); Ok(RemoteLogDownloadFuture::new(receiver)) @@ -157,6 +172,7 @@ impl RemoteLogDownloader { remote_log_tablet_dir: &str, remote_path: &str, local_path: &Path, + remote_fs_props: &HashMap, ) -> Result { // Handle both URL (e.g., "s3://bucket/path") and local file paths // If the path doesn't contain "://", treat it as a local file path @@ -169,11 +185,27 @@ impl RemoteLogDownloader { // Create FileIO from the remote log tablet dir URL to get the storage let file_io_builder = FileIO::from_url(&remote_log_tablet_dir_url)?; + // For S3/S3A URLs, inject S3 credentials from props + let file_io_builder = if remote_log_tablet_dir.starts_with("s3://") + || remote_log_tablet_dir.starts_with("s3a://") + { + file_io_builder.with_props( + remote_fs_props + .iter() + .map(|(k, v)| (k.as_str(), v.as_str())), + ) + } else { + file_io_builder + }; + // Build storage and create operator directly let storage = Storage::build(file_io_builder)?; let (op, relative_path) = storage.create(remote_path)?; - // Get file metadata to know the size + // Timeout for remote storage operations (30 seconds) + const REMOTE_OP_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30); + + // Get file metadata to know the size with timeout let meta = op.stat(relative_path).await?; let file_size = meta.content_length(); @@ -184,13 +216,32 @@ 
impl RemoteLogDownloader { // opendal::Reader::read accepts a range, so we read in chunks const CHUNK_SIZE: u64 = 8 * 1024 * 1024; // 8MB chunks for efficient reading let mut offset = 0u64; + let mut chunk_count = 0u64; + let total_chunks = file_size.div_ceil(CHUNK_SIZE); while offset < file_size { let end = std::cmp::min(offset + CHUNK_SIZE, file_size); let range = offset..end; - - // Read chunk from remote storage - let chunk = op.read_with(relative_path).range(range.clone()).await?; + chunk_count += 1; + + if chunk_count <= 3 || chunk_count % 10 == 0 { + log::debug!( + "Remote log download: reading chunk {chunk_count}/{total_chunks} (offset {offset})" + ); + } + + // Read chunk from remote storage with timeout + let read_future = op.read_with(relative_path).range(range.clone()); + let chunk = tokio::time::timeout(REMOTE_OP_TIMEOUT, read_future) + .await + .map_err(|_| { + Error::Io(io::Error::new( + io::ErrorKind::TimedOut, + format!( + "Timeout reading chunk from remote storage: {remote_path} at offset {offset}" + ), + )) + })??; let bytes = chunk.to_bytes(); // Write chunk to local file @@ -254,10 +305,10 @@ impl RemotePendingFetch { // delete the downloaded local file to free disk delete_file(file_path).await; - // Parse log records + // Parse log records (remote log contains full data, need client-side projection) let mut fetch_records = vec![]; for log_record in &mut LogRecordsBatchs::new(data) { - fetch_records.extend(log_record.records(&self.read_context)?); + fetch_records.extend(log_record.records_for_remote_log(&self.read_context)?); } let mut result = HashMap::new(); diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index f6780d715c..f66d7d7a14 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -16,6 +16,7 @@ // under the License. 
use crate::client::connection::FlussConnection; +use crate::client::credentials::CredentialsCache; use crate::client::metadata::Metadata; use crate::error::{Error, Result}; use crate::metadata::{TableBucket, TableInfo, TablePath}; @@ -194,6 +195,7 @@ struct LogFetcher { log_scanner_status: Arc, read_context: ReadContext, remote_log_downloader: RemoteLogDownloader, + credentials_cache: CredentialsCache, } impl LogFetcher { @@ -217,6 +219,7 @@ impl LogFetcher { log_scanner_status, read_context, remote_log_downloader: RemoteLogDownloader::new(tmp_dir)?, + credentials_cache: CredentialsCache::new(), }) } @@ -256,6 +259,12 @@ impl LogFetcher { if let Some(ref remote_log_fetch_info) = fetch_log_for_bucket.remote_log_fetch_info { + let remote_fs_props = self + .credentials_cache + .get_or_refresh(&self.conns, &self.metadata) + .await?; + self.remote_log_downloader + .set_remote_fs_props(remote_fs_props); let remote_fetch_info = RemoteLogFetchInfo::from_proto( remote_log_fetch_info, table_bucket.clone(), diff --git a/fluss-rust/crates/fluss/src/io/mod.rs b/fluss-rust/crates/fluss/src/io/mod.rs index 3c9a1659e4..a03a394529 100644 --- a/fluss-rust/crates/fluss/src/io/mod.rs +++ b/fluss-rust/crates/fluss/src/io/mod.rs @@ -27,8 +27,13 @@ pub use storage::*; mod storage_fs; #[cfg(feature = "storage-fs")] use storage_fs::*; + #[cfg(feature = "storage-memory")] mod storage_memory; - #[cfg(feature = "storage-memory")] use storage_memory::*; + +#[cfg(feature = "storage-s3")] +mod storage_s3; +#[cfg(feature = "storage-s3")] +use storage_s3::*; diff --git a/fluss-rust/crates/fluss/src/io/storage.rs b/fluss-rust/crates/fluss/src/io/storage.rs index 361da7ee80..089670e24c 100644 --- a/fluss-rust/crates/fluss/src/io/storage.rs +++ b/fluss-rust/crates/fluss/src/io/storage.rs @@ -19,6 +19,7 @@ use crate::error; use crate::error::Result; use crate::io::FileIOBuilder; use opendal::{Operator, Scheme}; +use std::collections::HashMap; /// The storage carries all supported storage services in 
fluss #[derive(Debug)] @@ -27,11 +28,13 @@ pub enum Storage { Memory, #[cfg(feature = "storage-fs")] LocalFs, + #[cfg(feature = "storage-s3")] + S3 { props: HashMap }, } impl Storage { pub(crate) fn build(file_io_builder: FileIOBuilder) -> Result { - let (scheme_str, _) = file_io_builder.into_parts(); + let (scheme_str, props) = file_io_builder.into_parts(); let scheme = Self::parse_scheme(&scheme_str)?; match scheme { @@ -39,6 +42,8 @@ impl Storage { Scheme::Memory => Ok(Self::Memory), #[cfg(feature = "storage-fs")] Scheme::Fs => Ok(Self::LocalFs), + #[cfg(feature = "storage-s3")] + Scheme::S3 => Ok(Self::S3 { props }), _ => Err(error::Error::IoUnsupported( "Unsupported storage feature".to_string(), )), @@ -66,6 +71,14 @@ impl Storage { Ok((op, &path[1..])) } } + #[cfg(feature = "storage-s3")] + Storage::S3 { props } => { + let (bucket, key) = super::parse_s3_path(path); + let mut s3_props = props.clone(); + s3_props.insert("bucket".to_string(), bucket.to_string()); + let op = super::s3_config_build(&s3_props)?; + Ok((op, key)) + } } } @@ -73,6 +86,7 @@ impl Storage { match scheme { "memory" => Ok(Scheme::Memory), "file" | "" => Ok(Scheme::Fs), + "s3" | "s3a" => Ok(Scheme::S3), s => Ok(s.parse::()?), } } diff --git a/fluss-rust/crates/fluss/src/io/storage_s3.rs b/fluss-rust/crates/fluss/src/io/storage_s3.rs new file mode 100644 index 0000000000..8000d091dd --- /dev/null +++ b/fluss-rust/crates/fluss/src/io/storage_s3.rs @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Result; +use opendal::Configurator; +use opendal::Operator; +use opendal::layers::TimeoutLayer; +use opendal::services::S3Config; +use std::collections::HashMap; +use std::time::Duration; + +pub(crate) fn s3_config_build(props: &HashMap) -> Result { + let config = S3Config::from_iter(props.clone())?; + let op = Operator::from_config(config)?.finish(); + + // Add timeout layer to prevent hanging on S3 operations + let timeout_layer = TimeoutLayer::new() + .with_timeout(Duration::from_secs(10)) + .with_io_timeout(Duration::from_secs(30)); + + Ok(op.layer(timeout_layer)) +} + +pub(crate) fn parse_s3_path(path: &str) -> (&str, &str) { + let path = path + .strip_prefix("s3a://") + .or_else(|| path.strip_prefix("s3://")) + .unwrap_or(path); + + match path.find('/') { + Some(idx) => (&path[..idx], &path[idx + 1..]), + None => (path, ""), + } +} diff --git a/fluss-rust/crates/fluss/src/proto/fluss_api.proto b/fluss-rust/crates/fluss/src/proto/fluss_api.proto index ef460fc559..e59c2d9dd6 100644 --- a/fluss-rust/crates/fluss/src/proto/fluss_api.proto +++ b/fluss-rust/crates/fluss/src/proto/fluss_api.proto @@ -297,4 +297,19 @@ message PbLakeSnapshotForBucket { optional int64 partition_id = 1; required int32 bucket_id = 2; optional int64 log_offset = 3; +} + +message PbKeyValue { + required string key = 1; + required string value = 2; +} + +message GetFileSystemSecurityTokenRequest { +} + +message GetFileSystemSecurityTokenResponse { + required string schema = 1; + required bytes token = 2; + optional int64 expiration_time = 3; + repeated 
PbKeyValue addition_info = 4; } \ No newline at end of file diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index 806c9a5824..f079f09dfb 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -504,6 +504,30 @@ impl<'a> LogRecordBatch<'a> { }; Ok(log_record_iterator) } + + pub fn records_for_remote_log(&self, read_context: &ReadContext) -> Result { + if self.record_count() == 0 { + return Ok(LogRecordIterator::empty()); + } + + let data = &self.data[RECORDS_OFFSET..]; + + let record_batch = read_context.record_batch_for_remote_log(data)?; + let log_record_iterator = match record_batch { + None => LogRecordIterator::empty(), + Some(record_batch) => { + let arrow_reader = ArrowReader::new(Arc::new(record_batch)); + LogRecordIterator::Arrow(ArrowLogRecordIterator { + reader: arrow_reader, + base_offset: self.base_log_offset(), + timestamp: self.commit_timestamp(), + row_id: 0, + change_type: ChangeType::AppendOnly, + }) + } + }; + Ok(log_record_iterator) + } } /// Parse an Arrow IPC message from a byte slice. 
@@ -552,7 +576,8 @@ fn parse_ipc_message( let message = root_as_message(metadata_bytes).ok()?; let batch_metadata = message.header_as_record_batch()?; - let body_start = 8 + metadata_size; + let metadata_padded_size = (metadata_size + 7) & !7; + let body_start = 8 + metadata_padded_size; let body_data = &data[body_start..]; let body_buffer = Buffer::from(body_data); @@ -677,7 +702,7 @@ pub fn to_arrow_type(fluss_type: &DataType) -> ArrowDataType { #[derive(Clone)] pub struct ReadContext { target_schema: SchemaRef, - + full_schema: SchemaRef, projection: Option, } @@ -694,7 +719,8 @@ struct Projection { impl ReadContext { pub fn new(arrow_schema: SchemaRef) -> ReadContext { ReadContext { - target_schema: arrow_schema, + target_schema: arrow_schema.clone(), + full_schema: arrow_schema, projection: None, } } @@ -730,7 +756,10 @@ impl ReadContext { } } else { Projection { - ordered_schema: Self::project_schema(arrow_schema, projected_fields.as_slice()), + ordered_schema: Self::project_schema( + arrow_schema.clone(), + projected_fields.as_slice(), + ), ordered_fields: projected_fields.clone(), projected_fields, reordering_indexes: vec![], @@ -741,6 +770,7 @@ impl ReadContext { ReadContext { target_schema, + full_schema: arrow_schema, projection: Some(project), } } @@ -809,6 +839,35 @@ impl ReadContext { }; Ok(Some(record_batch)) } + + pub fn record_batch_for_remote_log(&self, data: &[u8]) -> Result> { + let (batch_metadata, body_buffer, version) = match parse_ipc_message(data) { + Some(result) => result, + None => return Ok(None), + }; + + let record_batch = read_record_batch( + &body_buffer, + batch_metadata, + self.full_schema.clone(), + &std::collections::HashMap::new(), + None, + &version, + )?; + + let record_batch = match &self.projection { + Some(projection) => { + let projected_columns: Vec<_> = projection + .projected_fields + .iter() + .map(|&idx| record_batch.column(idx).clone()) + .collect(); + RecordBatch::try_new(self.target_schema.clone(), 
projected_columns)? + } + None => record_batch, + }; + Ok(Some(record_batch)) + } } pub enum LogRecordIterator { diff --git a/fluss-rust/crates/fluss/src/rpc/api_key.rs b/fluss-rust/crates/fluss/src/rpc/api_key.rs index 215bb39389..b11647f960 100644 --- a/fluss-rust/crates/fluss/src/rpc/api_key.rs +++ b/fluss-rust/crates/fluss/src/rpc/api_key.rs @@ -32,6 +32,7 @@ pub enum ApiKey { ProduceLog, FetchLog, ListOffsets, + GetFileSystemSecurityToken, GetDatabaseInfo, GetLatestLakeSnapshot, Unknown(i16), @@ -53,6 +54,7 @@ impl From for ApiKey { 1014 => ApiKey::ProduceLog, 1015 => ApiKey::FetchLog, 1021 => ApiKey::ListOffsets, + 1025 => ApiKey::GetFileSystemSecurityToken, 1032 => ApiKey::GetLatestLakeSnapshot, 1035 => ApiKey::GetDatabaseInfo, _ => Unknown(key), @@ -76,6 +78,7 @@ impl From for i16 { ApiKey::ProduceLog => 1014, ApiKey::FetchLog => 1015, ApiKey::ListOffsets => 1021, + ApiKey::GetFileSystemSecurityToken => 1025, ApiKey::GetLatestLakeSnapshot => 1032, ApiKey::GetDatabaseInfo => 1035, Unknown(x) => x, diff --git a/fluss-rust/crates/fluss/src/rpc/message/get_security_token.rs b/fluss-rust/crates/fluss/src/rpc/message/get_security_token.rs new file mode 100644 index 0000000000..7995232d1a --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/get_security_token.rs @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::proto::{GetFileSystemSecurityTokenRequest, GetFileSystemSecurityTokenResponse}; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; +use crate::{impl_read_version_type, impl_write_version_type}; +use bytes::{Buf, BufMut}; +use prost::Message; + +#[derive(Debug)] +pub struct GetSecurityTokenRequest { + pub inner_request: GetFileSystemSecurityTokenRequest, +} + +impl GetSecurityTokenRequest { + pub fn new() -> Self { + Self { + inner_request: GetFileSystemSecurityTokenRequest {}, + } + } +} + +impl Default for GetSecurityTokenRequest { + fn default() -> Self { + Self::new() + } +} + +impl RequestBody for GetSecurityTokenRequest { + type ResponseBody = GetFileSystemSecurityTokenResponse; + const API_KEY: ApiKey = ApiKey::GetFileSystemSecurityToken; + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(GetSecurityTokenRequest); +impl_read_version_type!(GetFileSystemSecurityTokenResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/mod.rs b/fluss-rust/crates/fluss/src/rpc/message/mod.rs index 230d971a49..0ed5b7c0e6 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/mod.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/mod.rs @@ -28,6 +28,7 @@ mod drop_table; mod fetch; mod get_database_info; mod get_latest_lake_snapshot; +mod get_security_token; mod get_table; mod header; mod list_databases; @@ -45,6 +46,7 @@ pub use drop_table::*; pub use fetch::*; 
pub use get_database_info::*; pub use get_latest_lake_snapshot::*; +pub use get_security_token::*; pub use get_table::*; pub use header::*; pub use list_databases::*; From 57ebe0c49e70f4a1fd63631c9010d4f93079d81d Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Sat, 20 Dec 2025 03:37:01 +0000 Subject: [PATCH 036/287] chore: parse_ipc_message should return exception instead of return None if parse failed (#97) --------- Co-authored-by: luoyuxia --- fluss-rust/crates/fluss/src/error.rs | 2 +- fluss-rust/crates/fluss/src/record/arrow.rs | 112 +++++++++++++++----- 2 files changed, 84 insertions(+), 30 deletions(-) diff --git a/fluss-rust/crates/fluss/src/error.rs b/fluss-rust/crates/fluss/src/error.rs index b1d5d13b6e..63438b1966 100644 --- a/fluss-rust/crates/fluss/src/error.rs +++ b/fluss-rust/crates/fluss/src/error.rs @@ -39,7 +39,7 @@ pub enum Error { #[error("Row convert error")] RowConvertError(String), - #[error("arrow error")] + #[error("Arrow error: {0}")] ArrowError(#[from] ArrowError), #[error("Write error: {0}")] diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index f079f09dfb..6e8cb55962 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -34,6 +34,7 @@ use arrow::{ writer::StreamWriter, }, }; +use arrow_schema::ArrowError::ParseError; use arrow_schema::SchemaRef; use arrow_schema::{DataType as ArrowDataType, Field}; use byteorder::WriteBytesExt; @@ -489,19 +490,15 @@ impl<'a> LogRecordBatch<'a> { let data = &self.data[RECORDS_OFFSET..]; let record_batch = read_context.record_batch(data)?; - let log_record_iterator = match record_batch { - None => LogRecordIterator::empty(), - Some(record_batch) => { - let arrow_reader = ArrowReader::new(Arc::new(record_batch)); - LogRecordIterator::Arrow(ArrowLogRecordIterator { - reader: arrow_reader, - base_offset: self.base_log_offset(), - timestamp: self.commit_timestamp(), - row_id: 0, - change_type: 
ChangeType::AppendOnly, - }) - } - }; + let arrow_reader = ArrowReader::new(Arc::new(record_batch)); + let log_record_iterator = LogRecordIterator::Arrow(ArrowLogRecordIterator { + reader: arrow_reader, + base_offset: self.base_log_offset(), + timestamp: self.commit_timestamp(), + row_id: 0, + change_type: ChangeType::AppendOnly, + }); + Ok(log_record_iterator) } @@ -542,15 +539,16 @@ impl<'a> LogRecordBatch<'a> { /// * `data` - The byte slice containing the IPC message. /// /// # Returns -/// Returns `Some((batch_metadata, body_buffer, version))` on success: +/// Returns `Ok((batch_metadata, body_buffer, version))` on success: /// - `batch_metadata`: The RecordBatch metadata from the IPC message. /// - `body_buffer`: The buffer containing the record batch body data. /// - `version`: The Arrow IPC metadata version. /// -/// Returns `None` if the data is malformed or too short. +/// Returns `Err(arrow_error)` on errors +/// - `arrow_error`: Error details e.g. malformed, too short or bad continuation marker. fn parse_ipc_message( data: &[u8], -) -> Option<( +) -> Result<( arrow::ipc::RecordBatch<'_>, Buffer, arrow::ipc::MetadataVersion, @@ -558,30 +556,38 @@ fn parse_ipc_message( const CONTINUATION_MARKER: u32 = 0xFFFFFFFF; if data.len() < 8 { - return None; + Err(ParseError(format!("Invalid data length: {}", data.len())))? } let continuation = LittleEndian::read_u32(&data[0..4]); let metadata_size = LittleEndian::read_u32(&data[4..8]) as usize; if continuation != CONTINUATION_MARKER { - return None; + Err(ParseError(format!( + "Invalid continuation marker: {continuation}" + )))? } if data.len() < 8 + metadata_size { - return None; + Err(ParseError(format!( + "Invalid data length. Remaining data length {} is shorter than specified size {}", + data.len() - 8, + metadata_size + )))? 
} let metadata_bytes = &data[8..8 + metadata_size]; - let message = root_as_message(metadata_bytes).ok()?; - let batch_metadata = message.header_as_record_batch()?; + let message = root_as_message(metadata_bytes).map_err(|err| ParseError(err.to_string()))?; + let batch_metadata = message + .header_as_record_batch() + .ok_or(ParseError(String::from("Not a record batch")))?; let metadata_padded_size = (metadata_size + 7) & !7; let body_start = 8 + metadata_padded_size; let body_data = &data[body_start..]; let body_buffer = Buffer::from(body_data); - Some((batch_metadata, body_buffer, message.version())) + Ok((batch_metadata, body_buffer, message.version())) } pub fn to_arrow_schema(fluss_schema: &DataType) -> SchemaRef { @@ -602,7 +608,7 @@ pub fn to_arrow_schema(fluss_schema: &DataType) -> SchemaRef { SchemaRef::new(arrow_schema::Schema::new(fields)) } _ => { - panic!("must be row data tyoe.") + panic!("must be row data type.") } } } @@ -796,11 +802,8 @@ impl ReadContext { .map(|p| p.ordered_fields.as_slice()) } - pub fn record_batch(&self, data: &[u8]) -> Result> { - let (batch_metadata, body_buffer, version) = match parse_ipc_message(data) { - Some(result) => result, - None => return Ok(None), - }; + pub fn record_batch(&self, data: &[u8]) -> Result { + let (batch_metadata, body_buffer, version) = parse_ipc_message(data)?; // the record batch from server must be ordered by field pos, // according to project to decide what arrow schema to use @@ -837,7 +840,7 @@ impl ReadContext { } _ => record_batch, }; - Ok(Some(record_batch)) + Ok(record_batch) } pub fn record_batch_for_remote_log(&self, data: &[u8]) -> Result> { @@ -1076,4 +1079,55 @@ mod tests { fn test_timestamp_ltz_invalid_precision() { to_arrow_type(&DataTypes::timestamp_ltz_with_precision(10)); } + + #[test] + fn test_parse_ipc_message() { + let empty_body: &[u8] = &le_bytes(&[0xFFFFFFFF, 0x00000000]); + let result = parse_ipc_message(empty_body); + assert_eq!( + result.unwrap_err().to_string(), + 
String::from("Arrow error: Parser error: Range [0, 4) is out of bounds.\n\n") + ); + + let invalid_data = &[]; + assert_eq!( + parse_ipc_message(invalid_data).unwrap_err().to_string(), + String::from("Arrow error: Parser error: Invalid data length: 0") + ); + + let data_with_invalid_continuation: &[u8] = &le_bytes(&[0x00000001, 0x00000000]); + assert_eq!( + parse_ipc_message(data_with_invalid_continuation) + .unwrap_err() + .to_string(), + String::from("Arrow error: Parser error: Invalid continuation marker: 1") + ); + + let data_with_invalid_length: &[u8] = &le_bytes(&[0xFFFFFFFF, 0x00000001]); + assert_eq!( + parse_ipc_message(data_with_invalid_length) + .unwrap_err() + .to_string(), + String::from( + "Arrow error: Parser error: Invalid data length. \ + Remaining data length 0 is shorter than specified size 1" + ) + ); + + let data_with_invalid_length = &le_bytes(&[0xFFFFFFFF, 0x00000004, 0x00000000]); + assert_eq!( + parse_ipc_message(data_with_invalid_length) + .unwrap_err() + .to_string(), + String::from("Arrow error: Parser error: Not a record batch") + ); + } + + fn le_bytes(vals: &[u32]) -> Vec { + let mut out = Vec::with_capacity(vals.len() * 4); + for &v in vals { + out.extend_from_slice(&v.to_le_bytes()); + } + out + } } From e585d1f0aafa890f74ffb1da8aab09908345fb02 Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Sat, 20 Dec 2025 12:04:20 +0800 Subject: [PATCH 037/287] chore: Improve error (#77) --- fluss-rust/crates/fluss/Cargo.toml | 1 + fluss-rust/crates/fluss/src/client/admin.rs | 28 +- .../crates/fluss/src/client/credentials.rs | 7 +- .../fluss/src/client/table/remote_log.rs | 29 +- .../crates/fluss/src/client/table/scanner.rs | 30 +- .../crates/fluss/src/client/write/mod.rs | 10 +- .../crates/fluss/src/client/write/sender.rs | 11 +- .../fluss/src/client/write/writer_client.rs | 17 +- fluss-rust/crates/fluss/src/error.rs | 139 +++++-- fluss-rust/crates/fluss/src/io/file_io.rs | 5 +- fluss-rust/crates/fluss/src/io/storage.rs | 6 +- 
.../crates/fluss/src/metadata/database.rs | 22 +- .../crates/fluss/src/metadata/json_serde.rs | 179 +++++---- fluss-rust/crates/fluss/src/metadata/table.rs | 88 +++-- .../crates/fluss/src/proto/fluss_api.proto | 5 + fluss-rust/crates/fluss/src/record/arrow.rs | 24 +- fluss-rust/crates/fluss/src/row/datum.rs | 34 +- fluss-rust/crates/fluss/src/rpc/error.rs | 4 + .../crates/fluss/src/rpc/fluss_api_error.rs | 371 ++++++++++++++++++ fluss-rust/crates/fluss/src/rpc/frame.rs | 4 + .../fluss/src/rpc/message/create_database.rs | 3 +- .../fluss/src/rpc/message/create_table.rs | 3 +- .../fluss/src/rpc/message/database_exists.rs | 4 +- .../fluss/src/rpc/message/drop_database.rs | 4 +- .../fluss/src/rpc/message/drop_table.rs | 4 +- .../crates/fluss/src/rpc/message/fetch.rs | 4 +- .../src/rpc/message/get_database_info.rs | 4 +- .../rpc/message/get_latest_lake_snapshot.rs | 4 +- .../crates/fluss/src/rpc/message/get_table.rs | 4 +- .../crates/fluss/src/rpc/message/header.rs | 18 +- .../fluss/src/rpc/message/list_databases.rs | 4 +- .../fluss/src/rpc/message/list_offsets.rs | 19 +- .../fluss/src/rpc/message/list_tables.rs | 4 +- .../crates/fluss/src/rpc/message/mod.rs | 1 + .../fluss/src/rpc/message/produce_log.rs | 4 +- .../fluss/src/rpc/message/table_exists.rs | 5 +- .../fluss/src/rpc/message/update_metadata.rs | 4 +- fluss-rust/crates/fluss/src/rpc/mod.rs | 4 +- .../crates/fluss/src/rpc/server_connection.rs | 24 +- .../crates/fluss/tests/integration/admin.rs | 32 ++ .../tests/integration/table_remote_scan.rs | 14 +- 41 files changed, 908 insertions(+), 273 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/rpc/fluss_api_error.rs diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index 0cf0364773..cdba9de5a4 100644 --- a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -57,6 +57,7 @@ opendal = "0.55.0" url = "2.5.7" uuid = { version = "1.10", features = ["v4"] } tempfile = "3.23.0" +snafu = "0.8.3" 
[target.'cfg(target_arch = "wasm32")'.dependencies] jiff = { workspace = true, features = ["js"] } diff --git a/fluss-rust/crates/fluss/src/client/admin.rs b/fluss-rust/crates/fluss/src/client/admin.rs index fefab43520..e185af84ac 100644 --- a/fluss-rust/crates/fluss/src/client/admin.rs +++ b/fluss-rust/crates/fluss/src/client/admin.rs @@ -29,7 +29,7 @@ use crate::rpc::message::{ListOffsetsRequest, OffsetSpec}; use crate::rpc::{RpcClient, ServerConnection}; use crate::BucketId; -use crate::error::Result; +use crate::error::{Error, Result}; use crate::proto::GetTableInfoResponse; use std::collections::HashMap; use std::slice::from_ref; @@ -245,10 +245,10 @@ impl FlussAdmin { let mut results = HashMap::new(); for response_future in response_futures { - let offsets = response_future.await.map_err( - // todo: consider use suitable error - |e| crate::error::Error::WriteError(format!("Fail to get result: {e}")), - )?; + let offsets = response_future.await.map_err(|e| Error::UnexpectedError { + message: "Fail to get result for list offsets.".to_string(), + source: Some(Box::new(e)), + })?; results.extend(offsets?); } Ok(results) @@ -267,10 +267,11 @@ impl FlussAdmin { for bucket_id in buckets { let table_bucket = TableBucket::new(table_id, *bucket_id); let leader = cluster.leader_for(&table_bucket).ok_or_else(|| { - // todo: consider use another suitable error - crate::error::Error::InvalidTableError(format!( - "No leader found for table bucket: table_id={table_id}, bucket_id={bucket_id}" - )) + // todo: consider retry? 
+ Error::UnexpectedError { + message: format!("No leader found for table bucket: {table_bucket}."), + source: None, + } })?; node_for_bucket_list @@ -301,10 +302,11 @@ impl FlussAdmin { let task = tokio::spawn(async move { let cluster = metadata.get_cluster(); let tablet_server = cluster.get_tablet_server(leader_id).ok_or_else(|| { - // todo: consider use more suitable error - crate::error::Error::InvalidTableError(format!( - "Tablet server {leader_id} not found" - )) + Error::LeaderNotAvailable { + message: format!( + "Tablet server {leader_id} is not found in metadata cache." + ), + } })?; let connection = rpc_client.get_connection(tablet_server).await?; let list_offsets_response = connection.request(request).await?; diff --git a/fluss-rust/crates/fluss/src/client/credentials.rs b/fluss-rust/crates/fluss/src/client/credentials.rs index bd2a477a85..6b07d08eb1 100644 --- a/fluss-rust/crates/fluss/src/client/credentials.rs +++ b/fluss-rust/crates/fluss/src/client/credentials.rs @@ -134,9 +134,10 @@ impl CredentialsCache { return Ok(HashMap::new()); } - let credentials: Credentials = serde_json::from_slice(&response.token).map_err(|e| { - Error::JsonSerdeError(format!("Error when parse token from server: {e}")) - })?; + let credentials: Credentials = + serde_json::from_slice(&response.token).map_err(|e| Error::JsonSerdeError { + message: format!("Error when parse token from server: {e}"), + })?; let mut addition_infos = HashMap::new(); for kv in &response.addition_info { diff --git a/fluss-rust/crates/fluss/src/client/table/remote_log.rs b/fluss-rust/crates/fluss/src/client/table/remote_log.rs index a2561f3925..10273dde2e 100644 --- a/fluss-rust/crates/fluss/src/client/table/remote_log.rs +++ b/fluss-rust/crates/fluss/src/client/table/remote_log.rs @@ -100,15 +100,14 @@ impl RemoteLogDownloadFuture { /// Get the downloaded file path pub async fn get_file_path(&mut self) -> Result { - let receiver = self - .receiver - .take() - .ok_or_else(|| 
Error::Io(io::Error::other("Download future already consumed")))?; - - receiver.await.map_err(|e| { - Error::Io(io::Error::other(format!( - "Download future cancelled: {e:?}" - ))) + let receiver = self.receiver.take().ok_or_else(|| Error::UnexpectedError { + message: "Downloaded file already consumed".to_string(), + source: None, + })?; + + receiver.await.map_err(|e| Error::UnexpectedError { + message: format!("Download future cancelled: {e:?}"), + source: None, })? } } @@ -234,13 +233,13 @@ impl RemoteLogDownloader { let read_future = op.read_with(relative_path).range(range.clone()); let chunk = tokio::time::timeout(REMOTE_OP_TIMEOUT, read_future) .await - .map_err(|_| { - Error::Io(io::Error::new( - io::ErrorKind::TimedOut, - format!( - "Timeout reading chunk from remote storage: {remote_path} at offset {offset}" + .map_err(|e| { + Error::IoUnexpectedError { + message: format!( + "Timeout reading chunk from remote storage: {remote_path} at offset {offset}, exception: {e}." ), - )) + source: io::ErrorKind::TimedOut.into(), + } })??; let bytes = chunk.to_bytes(); diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index f66d7d7a14..1e70649e0b 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -74,18 +74,20 @@ impl<'a> TableScan<'a> { /// ``` pub fn project(mut self, column_indices: &[usize]) -> Result { if column_indices.is_empty() { - return Err(Error::IllegalArgument( - "Column indices cannot be empty".to_string(), - )); + return Err(Error::IllegalArgument { + message: "Column indices cannot be empty".to_string(), + }); } let field_count = self.table_info.row_type().fields().len(); for &idx in column_indices { if idx >= field_count { - return Err(Error::IllegalArgument(format!( - "Column index {} out of range (max: {})", - idx, - field_count - 1 - ))); + return Err(Error::IllegalArgument { + message: format!( + "Column 
index {} out of range (max: {})", + idx, + field_count - 1 + ), + }); } } self.projected_fields = Some(column_indices.to_vec()); @@ -106,9 +108,9 @@ impl<'a> TableScan<'a> { /// ``` pub fn project_by_name(mut self, column_names: &[&str]) -> Result { if column_names.is_empty() { - return Err(Error::IllegalArgument( - "Column names cannot be empty".to_string(), - )); + return Err(Error::IllegalArgument { + message: "Column names cannot be empty".to_string(), + }); } let row_type = self.table_info.row_type(); let mut indices = Vec::new(); @@ -118,7 +120,9 @@ impl<'a> TableScan<'a> { .fields() .iter() .position(|f| f.name() == *name) - .ok_or_else(|| Error::IllegalArgument(format!("Column '{name}' not found")))?; + .ok_or_else(|| Error::IllegalArgument { + message: format!("Column '{name}' not found"), + })?; indices.push(idx); } @@ -277,7 +281,7 @@ impl LogFetcher { // Download and process remote log segments let mut pos_in_log_segment = remote_fetch_info.first_start_pos; let mut current_fetch_offset = fetch_offset; - // todo: make segment download parallelly + // todo: make segment download in parallel for (i, segment) in remote_fetch_info.remote_log_segments.iter().enumerate() { diff --git a/fluss-rust/crates/fluss/src/client/write/mod.rs b/fluss-rust/crates/fluss/src/client/write/mod.rs index e632cde451..cd33586c89 100644 --- a/fluss-rust/crates/fluss/src/client/write/mod.rs +++ b/fluss-rust/crates/fluss/src/client/write/mod.rs @@ -74,11 +74,17 @@ impl ResultHandle { self.receiver .receive() .await - .map_err(|e| Error::WriteError(e.to_string())) + .map_err(|e| Error::UnexpectedError { + message: format!("Fail to wait write result {e:?}"), + source: None, + }) } pub fn result(&self, batch_result: BatchWriteResult) -> Result<(), Error> { // do nothing, just return empty result - batch_result.map_err(|e| Error::WriteError(e.to_string())) + batch_result.map_err(|e| Error::UnexpectedError { + message: format!("Fail to get write result {e:?}"), + source: None, + }) } } 
diff --git a/fluss-rust/crates/fluss/src/client/write/sender.rs b/fluss-rust/crates/fluss/src/client/write/sender.rs index 27460e3863..462a846d6c 100644 --- a/fluss-rust/crates/fluss/src/client/write/sender.rs +++ b/fluss-rust/crates/fluss/src/client/write/sender.rs @@ -17,7 +17,7 @@ use crate::client::metadata::Metadata; use crate::client::{ReadyWriteBatch, RecordAccumulator}; -use crate::error::Error::WriteError; +use crate::error::Error; use crate::error::Result; use crate::metadata::TableBucket; use crate::proto::ProduceLogResponse; @@ -150,9 +150,12 @@ impl Sender { let cluster = self.metadata.get_cluster(); - let destination_node = cluster - .get_tablet_server(destination) - .ok_or(WriteError(String::from("destination node not found")))?; + let destination_node = + cluster + .get_tablet_server(destination) + .ok_or(Error::LeaderNotAvailable { + message: format!("destination node not found in metadata cache {destination}."), + })?; let connection = self.metadata.get_connection(destination_node).await?; for (table_id, write_batches) in write_batch_by_table { diff --git a/fluss-rust/crates/fluss/src/client/write/writer_client.rs b/fluss-rust/crates/fluss/src/client/write/writer_client.rs index 28f5371e8d..042859afb4 100644 --- a/fluss-rust/crates/fluss/src/client/write/writer_client.rs +++ b/fluss-rust/crates/fluss/src/client/write/writer_client.rs @@ -78,11 +78,12 @@ impl WriterClient { fn get_ack(config: &Config) -> Result { let acks = config.writer_acks.as_str(); - if acks.eq("all") { + if acks.eq_ignore_ascii_case("all") { Ok(-1) } else { - acks.parse::() - .map_err(|e| Error::IllegalArgument(e.to_string())) + acks.parse::().map_err(|e| Error::IllegalArgument { + message: format!("invalid writer ack '{acks}': {e}"), + }) } } @@ -133,11 +134,17 @@ impl WriterClient { self.shutdown_tx .send(()) .await - .map_err(|e| Error::WriteError(e.to_string()))?; + .map_err(|e| Error::UnexpectedError { + message: format!("Failed to close write client: {e:?}"), + source: 
None, + })?; self.sender_join_handle .await - .map_err(|e| Error::WriteError(e.to_string()))?; + .map_err(|e| Error::UnexpectedError { + message: format!("Failed to close write client: {e:?}"), + source: None, + })?; Ok(()) } diff --git a/fluss-rust/crates/fluss/src/error.rs b/fluss-rust/crates/fluss/src/error.rs index 63438b1966..0f4b1b6d11 100644 --- a/fluss-rust/crates/fluss/src/error.rs +++ b/fluss-rust/crates/fluss/src/error.rs @@ -15,48 +15,137 @@ // specific language governing permissions and limitations // under the License. -use crate::rpc::RpcError; +pub use crate::rpc::RpcError; +pub use crate::rpc::{ApiError, FlussError}; + use arrow_schema::ArrowError; +use snafu::Snafu; use std::{io, result}; -use thiserror::Error; pub type Result = result::Result; -#[derive(Debug, Error)] +#[derive(Debug, Snafu)] pub enum Error { - #[error(transparent)] - Io(#[from] io::Error), + #[snafu( + whatever, + display("Fluss hitting unexpected error {}: {:?}", message, source) + )] + UnexpectedError { + message: String, + /// see https://github.com/shepmaster/snafu/issues/446 + #[snafu(source(from(Box, Some)))] + source: Option>, + }, + + #[snafu( + visibility(pub(crate)), + display("Fluss hitting unexpected io error {}: {:?}", message, source) + )] + IoUnexpectedError { message: String, source: io::Error }, + + #[snafu( + visibility(pub(crate)), + display( + "Fluss hitting remote storage unexpected error {}: {:?}", + message, + source + ) + )] + RemoteStorageUnexpectedError { + message: String, + source: opendal::Error, + }, + + #[snafu( + visibility(pub(crate)), + display("Fluss hitting invalid table error {}.", message) + )] + InvalidTableError { message: String }, - #[error("Invalid table")] - InvalidTableError(String), + #[snafu( + visibility(pub(crate)), + display("Fluss hitting json serde error {}.", message) + )] + JsonSerdeError { message: String }, - #[error("Json serde error")] - JsonSerdeError(String), + #[snafu( + visibility(pub(crate)), + display("Fluss hitting 
unexpected rpc error {}: {:?}", message, source) + )] + RpcError { message: String, source: RpcError }, - #[error("Rpc error")] - RpcError(#[from] RpcError), + #[snafu( + visibility(pub(crate)), + display("Fluss hitting row convert error {}.", message) + )] + RowConvertError { message: String }, - #[error("Row convert error")] - RowConvertError(String), + #[snafu( + visibility(pub(crate)), + display("Fluss hitting Arrow error {}: {:?}.", message, source) + )] + ArrowError { message: String, source: ArrowError }, - #[error("Arrow error: {0}")] - ArrowError(#[from] ArrowError), + #[snafu( + visibility(pub(crate)), + display("Fluss hitting illegal argument error {}.", message) + )] + IllegalArgument { message: String }, - #[error("Write error: {0}")] - WriteError(String), + #[snafu( + visibility(pub(crate)), + display("Fluss hitting IO not supported error {}.", message) + )] + IoUnsupported { message: String }, - #[error("Illegal argument error: {0}")] - IllegalArgument(String), + #[snafu( + visibility(pub(crate)), + display("Fluss hitting leader not available error {}.", message) + )] + LeaderNotAvailable { message: String }, - #[error("IO not supported error: {0}")] - IoUnsupported(String), + #[snafu(visibility(pub(crate)), display("Fluss API Error: {}.", api_error))] + FlussAPIError { api_error: ApiError }, +} - #[error("IO operation failed on underlying storage: {0}")] - IoUnexpected(Box), +impl From for Error { + fn from(value: ArrowError) -> Self { + Error::ArrowError { + message: format!("{value}"), + source: value, + } + } +} + +impl From for Error { + fn from(value: RpcError) -> Self { + Error::RpcError { + message: format!("{value}"), + source: value, + } + } +} + +impl From for Error { + fn from(value: io::Error) -> Self { + Error::IoUnexpectedError { + message: format!("{value}"), + source: value, + } + } } impl From for Error { - fn from(err: opendal::Error) -> Self { - Error::IoUnexpected(Box::new(err)) + fn from(value: opendal::Error) -> Self { + 
Error::RemoteStorageUnexpectedError { + message: format!("{value}"), + source: value, + } + } +} + +impl From for Error { + fn from(value: ApiError) -> Self { + Error::FlussAPIError { api_error: value } } } diff --git a/fluss-rust/crates/fluss/src/io/file_io.rs b/fluss-rust/crates/fluss/src/io/file_io.rs index ec3b87ec5b..e7b026df55 100644 --- a/fluss-rust/crates/fluss/src/io/file_io.rs +++ b/fluss-rust/crates/fluss/src/io/file_io.rs @@ -39,8 +39,9 @@ pub struct FileIO { impl FileIO { /// Try to infer file io scheme from path. pub fn from_url(path: &str) -> Result { - let url = - Url::parse(path).map_err(|_| Error::IllegalArgument(format!("Invalid URL: {path}")))?; + let url = Url::parse(path).map_err(|_| Error::IllegalArgument { + message: format!("Invalid URL: {path}"), + })?; Ok(FileIOBuilder::new(url.scheme())) } diff --git a/fluss-rust/crates/fluss/src/io/storage.rs b/fluss-rust/crates/fluss/src/io/storage.rs index 089670e24c..d90eaa5711 100644 --- a/fluss-rust/crates/fluss/src/io/storage.rs +++ b/fluss-rust/crates/fluss/src/io/storage.rs @@ -44,9 +44,9 @@ impl Storage { Scheme::Fs => Ok(Self::LocalFs), #[cfg(feature = "storage-s3")] Scheme::S3 => Ok(Self::S3 { props }), - _ => Err(error::Error::IoUnsupported( - "Unsupported storage feature".to_string(), - )), + _ => Err(error::Error::IoUnsupported { + message: format!("Unsupported storage feature {scheme_str}"), + }), } } diff --git a/fluss-rust/crates/fluss/src/metadata/database.rs b/fluss-rust/crates/fluss/src/metadata/database.rs index 8eaa4d3eb0..fad1498f58 100644 --- a/fluss-rust/crates/fluss/src/metadata/database.rs +++ b/fluss-rust/crates/fluss/src/metadata/database.rs @@ -148,8 +148,8 @@ impl JsonSerde for DatabaseDescriptor { if let Some(comment_node) = node.get(Self::COMMENT_NAME) { let comment = comment_node .as_str() - .ok_or_else(|| { - JsonSerdeError(format!("{} should be a string", Self::COMMENT_NAME)) + .ok_or_else(|| JsonSerdeError { + message: format!("{} should be a string", 
Self::COMMENT_NAME), })? .to_owned(); builder = builder.comment(&comment); @@ -157,8 +157,8 @@ impl JsonSerde for DatabaseDescriptor { // Deserialize custom properties directly let custom_properties = if let Some(props_node) = node.get(Self::CUSTOM_PROPERTIES_NAME) { - let obj = props_node.as_object().ok_or_else(|| { - JsonSerdeError("Custom properties should be an object".to_string()) + let obj = props_node.as_object().ok_or_else(|| JsonSerdeError { + message: "Custom properties should be an object".to_string(), })?; let mut properties = HashMap::with_capacity(obj.len()); @@ -167,8 +167,8 @@ impl JsonSerde for DatabaseDescriptor { key.clone(), value .as_str() - .ok_or_else(|| { - JsonSerdeError("Property value should be a string".to_string()) + .ok_or_else(|| JsonSerdeError { + message: "Property value should be a string".to_string(), })? .to_owned(), ); @@ -186,16 +186,18 @@ impl JsonSerde for DatabaseDescriptor { impl DatabaseDescriptor { /// Create DatabaseDescriptor from JSON bytes (equivalent to Java's fromJsonBytes) pub fn from_json_bytes(bytes: &[u8]) -> Result { - let json_value: Value = serde_json::from_slice(bytes) - .map_err(|e| JsonSerdeError(format!("Failed to parse JSON: {e}")))?; + let json_value: Value = serde_json::from_slice(bytes).map_err(|e| JsonSerdeError { + message: format!("Failed to parse JSON: {e}"), + })?; Self::deserialize_json(&json_value) } /// Convert DatabaseDescriptor to JSON bytes pub fn to_json_bytes(&self) -> Result> { let json_value = self.serialize_json()?; - serde_json::to_vec(&json_value) - .map_err(|e| JsonSerdeError(format!("Failed to serialize to JSON: {e}"))) + serde_json::to_vec(&json_value).map_err(|e| JsonSerdeError { + message: format!("Failed to serialize to JSON: {e}"), + }) } } diff --git a/fluss-rust/crates/fluss/src/metadata/json_serde.rs b/fluss-rust/crates/fluss/src/metadata/json_serde.rs index 447b0f9ff3..7d94e194e2 100644 --- a/fluss-rust/crates/fluss/src/metadata/json_serde.rs +++ 
b/fluss-rust/crates/fluss/src/metadata/json_serde.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::error::Error::{InvalidTableError, JsonSerdeError}; -use crate::error::Result; +use crate::error::Error::JsonSerdeError; +use crate::error::{Error, Result}; use crate::metadata::datatype::{DataField, DataType, DataTypes}; use crate::metadata::table::{Column, Schema, TableDescriptor}; use serde_json::{Value, json}; @@ -166,11 +166,11 @@ impl JsonSerde for DataType { let type_root = node .get(Self::FIELD_NAME_TYPE_NAME) .and_then(|v| v.as_str()) - .ok_or_else(|| { - JsonSerdeError(format!( + .ok_or_else(|| Error::JsonSerdeError { + message: format!( "Couldn't find field {} while deserializing datatype.", Self::FIELD_NAME_TYPE_NAME - )) + ), })?; let mut data_type = match type_root { @@ -185,11 +185,8 @@ impl JsonSerde for DataType { let length = node .get(Self::FIELD_NAME_LENGTH) .and_then(|v| v.as_u64()) - .ok_or_else(|| { - JsonSerdeError(format!( - "Missing required field: {}", - Self::FIELD_NAME_LENGTH - )) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::FIELD_NAME_LENGTH), })? as u32; DataTypes::char(length) } @@ -198,11 +195,8 @@ impl JsonSerde for DataType { let precision = node .get(Self::FIELD_NAME_PRECISION) .and_then(|v| v.as_u64()) - .ok_or_else(|| { - JsonSerdeError(format!( - "Missing required field: {}", - Self::FIELD_NAME_PRECISION - )) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::FIELD_NAME_PRECISION), })? 
as u32; let scale = node .get(Self::FIELD_NAME_SCALE) @@ -243,43 +237,46 @@ impl JsonSerde for DataType { "ARRAY" => { let element_type_node = node.get(Self::FIELD_NAME_ELEMENT_TYPE).ok_or_else(|| { - JsonSerdeError(format!( - "Missing required field: {}", - Self::FIELD_NAME_ELEMENT_TYPE - )) + Error::JsonSerdeError { + message: format!( + "Missing required field: {}", + Self::FIELD_NAME_ELEMENT_TYPE + ), + } })?; let element_type = DataType::deserialize_json(element_type_node)?; DataTypes::array(element_type) } "MAP" => { - let key_type_node = node.get(Self::FIELD_NAME_KEY_TYPE).ok_or_else(|| { - JsonSerdeError(format!( - "Missing required field: {}", - Self::FIELD_NAME_KEY_TYPE - )) - })?; + let key_type_node = + node.get(Self::FIELD_NAME_KEY_TYPE) + .ok_or_else(|| Error::JsonSerdeError { + message: format!( + "Missing required field: {}", + Self::FIELD_NAME_KEY_TYPE + ), + })?; let key_type = DataType::deserialize_json(key_type_node)?; - let value_type_node = node.get(Self::FIELD_NAME_VALUE_TYPE).ok_or_else(|| { - JsonSerdeError(format!( - "Missing required field: {}", - Self::FIELD_NAME_VALUE_TYPE - )) - })?; + let value_type_node = + node.get(Self::FIELD_NAME_VALUE_TYPE) + .ok_or_else(|| Error::JsonSerdeError { + message: format!( + "Missing required field: {}", + Self::FIELD_NAME_VALUE_TYPE + ), + })?; let value_type = DataType::deserialize_json(value_type_node)?; DataTypes::map(key_type, value_type) } "ROW" => { let fields_node = node .get(Self::FIELD_NAME_FIELDS) - .ok_or_else(|| { - JsonSerdeError(format!( - "Missing required field: {}", - Self::FIELD_NAME_FIELDS - )) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::FIELD_NAME_FIELDS), })? 
.as_array() - .ok_or_else(|| { - JsonSerdeError(format!("{} must be an array", Self::FIELD_NAME_FIELDS)) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("{} must be an array", Self::FIELD_NAME_FIELDS), })?; let mut fields = Vec::with_capacity(fields_node.len()); for field_node in fields_node { @@ -287,7 +284,11 @@ impl JsonSerde for DataType { } DataTypes::row(fields) } - _ => return Err(JsonSerdeError(format!("Unknown type root: {type_root}"))), + _ => { + return Err(Error::JsonSerdeError { + message: format!("Unknown type root: {type_root}"), + }); + } }; if let Some(nullable) = node.get(Self::FIELD_NAME_NULLABLE) { @@ -327,12 +328,16 @@ impl JsonSerde for DataField { let name = node .get(Self::NAME) .and_then(|v| v.as_str()) - .ok_or_else(|| JsonSerdeError(format!("Missing required field: {}", Self::NAME)))? + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::NAME), + })? .to_string(); - let field_type_node = node.get(Self::FIELD_TYPE).ok_or_else(|| { - JsonSerdeError(format!("Missing required field: {}", Self::FIELD_TYPE)) - })?; + let field_type_node = node + .get(Self::FIELD_TYPE) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::FIELD_TYPE), + })?; let data_type = DataType::deserialize_json(field_type_node)?; @@ -373,12 +378,16 @@ impl JsonSerde for Column { let name = node .get(Self::NAME) .and_then(|v| v.as_str()) - .ok_or_else(|| JsonSerdeError(format!("Missing required field: {}", Self::NAME)))? + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::NAME), + })? 
.to_string(); - let data_type_node = node.get(Self::DATA_TYPE).ok_or_else(|| { - JsonSerdeError(format!("Missing required field: {}", Self::DATA_TYPE)) - })?; + let data_type_node = node + .get(Self::DATA_TYPE) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::DATA_TYPE), + })?; let data_type = DataType::deserialize_json(data_type_node)?; @@ -429,11 +438,13 @@ impl JsonSerde for Schema { fn deserialize_json(node: &Value) -> Result { let columns_node = node .get(Self::COLUMNS_NAME) - .ok_or_else(|| { - JsonSerdeError(format!("Missing required field: {}", Self::COLUMNS_NAME)) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::COLUMNS_NAME), })? .as_array() - .ok_or_else(|| JsonSerdeError(format!("{} must be an array", Self::COLUMNS_NAME)))?; + .ok_or_else(|| Error::JsonSerdeError { + message: format!("{} must be an array", Self::COLUMNS_NAME), + })?; let mut columns = Vec::with_capacity(columns_node.len()); for col_node in columns_node { @@ -443,17 +454,17 @@ impl JsonSerde for Schema { let mut schema_builder = Schema::builder().with_columns(columns); if let Some(pk_node) = node.get(Self::PRIMARY_KEY_NAME) { - let pk_array = pk_node - .as_array() - .ok_or_else(|| InvalidTableError("Primary key must be an array".to_string()))?; + let pk_array = pk_node.as_array().ok_or_else(|| Error::InvalidTableError { + message: "Primary key must be an array".to_string(), + })?; let mut primary_keys = Vec::with_capacity(pk_array.len()); for name_node in pk_array { primary_keys.push( name_node .as_str() - .ok_or_else(|| { - InvalidTableError("Primary key element must be a string".to_string()) + .ok_or_else(|| Error::InvalidTableError { + message: "Primary key element must be a string".to_string(), })? 
.to_string(), ); @@ -478,9 +489,9 @@ impl TableDescriptor { const VERSION: u32 = 1; fn deserialize_properties(node: &Value) -> Result> { - let obj = node - .as_object() - .ok_or_else(|| JsonSerdeError("Properties must be an object".to_string()))?; + let obj = node.as_object().ok_or_else(|| Error::JsonSerdeError { + message: "Properties must be an object".to_string(), + })?; let mut properties = HashMap::with_capacity(obj.len()); for (key, value) in obj { @@ -488,7 +499,9 @@ impl TableDescriptor { key.clone(), value .as_str() - .ok_or_else(|| JsonSerdeError("Property value must be a string".to_string()))? + .ok_or_else(|| Error::JsonSerdeError { + message: "Property value must be a string".to_string(), + })? .to_owned(), ); } @@ -545,8 +558,8 @@ impl JsonSerde for TableDescriptor { let mut builder = TableDescriptor::builder(); // Deserialize schema - let schema_node = node.get(Self::SCHEMA_NAME).ok_or_else(|| { - JsonSerdeError(format!("Missing required field: {}", Self::SCHEMA_NAME)) + let schema_node = node.get(Self::SCHEMA_NAME).ok_or_else(|| JsonSerdeError { + message: format!("Missing required field: {}", Self::SCHEMA_NAME), })?; let schema = Schema::deserialize_json(schema_node)?; builder = builder.schema(schema); @@ -555,22 +568,21 @@ impl JsonSerde for TableDescriptor { if let Some(comment_node) = node.get(Self::COMMENT_NAME) { let comment = comment_node .as_str() - .ok_or_else(|| JsonSerdeError(format!("{} must be a string", Self::COMMENT_NAME)))? + .ok_or_else(|| JsonSerdeError { + message: format!("{} must be a string", Self::COMMENT_NAME), + })? .to_owned(); builder = builder.comment(comment.as_str()); } let partition_node = node .get(Self::PARTITION_KEY_NAME) - .ok_or_else(|| { - JsonSerdeError(format!( - "Missing required field: {}", - Self::PARTITION_KEY_NAME - )) + .ok_or_else(|| JsonSerdeError { + message: format!("Missing required field: {}", Self::PARTITION_KEY_NAME), })? 
.as_array() - .ok_or_else(|| { - JsonSerdeError(format!("{} must be an array", Self::PARTITION_KEY_NAME)) + .ok_or_else(|| JsonSerdeError { + message: format!("{} must be an array", Self::PARTITION_KEY_NAME), })?; let mut partition_keys = Vec::with_capacity(partition_node.len()); @@ -578,11 +590,8 @@ impl JsonSerde for TableDescriptor { partition_keys.push( key_node .as_str() - .ok_or_else(|| { - JsonSerdeError(format!( - "{} element must be a string", - Self::PARTITION_KEY_NAME - )) + .ok_or_else(|| JsonSerdeError { + message: format!("{} element must be a string", Self::PARTITION_KEY_NAME), })? .to_owned(), ); @@ -592,15 +601,17 @@ impl JsonSerde for TableDescriptor { let mut bucket_count = None; let mut bucket_keys = vec![]; if let Some(bucket_key_node) = node.get(Self::BUCKET_KEY_NAME) { - let bucket_key_node = bucket_key_node.as_array().ok_or_else(|| { - JsonSerdeError(format!("{} must be an array", Self::BUCKET_KEY_NAME)) + let bucket_key_node = bucket_key_node.as_array().ok_or_else(|| JsonSerdeError { + message: format!("{} must be an array", Self::BUCKET_KEY_NAME), })?; for key_node in bucket_key_node { bucket_keys.push( key_node .as_str() - .ok_or_else(|| JsonSerdeError("Bucket key must be a string".to_string()))? + .ok_or_else(|| JsonSerdeError { + message: "Bucket key must be a string".to_string(), + })? 
.to_owned(), ); } @@ -617,18 +628,18 @@ impl JsonSerde for TableDescriptor { // Deserialize properties let properties = Self::deserialize_properties(node.get(Self::PROPERTIES_NAME).ok_or_else(|| { - JsonSerdeError(format!("Missing required field: {}", Self::PROPERTIES_NAME)) + JsonSerdeError { + message: format!("Missing required field: {}", Self::PROPERTIES_NAME), + } })?)?; builder = builder.properties(properties); // Deserialize custom properties let custom_properties = Self::deserialize_properties( - node.get(Self::CUSTOM_PROPERTIES_NAME).ok_or_else(|| { - JsonSerdeError(format!( - "Missing required field: {}", - Self::CUSTOM_PROPERTIES_NAME - )) - })?, + node.get(Self::CUSTOM_PROPERTIES_NAME) + .ok_or_else(|| JsonSerdeError { + message: format!("Missing required field: {}", Self::CUSTOM_PROPERTIES_NAME), + })?, )?; builder = builder.custom_properties(custom_properties); diff --git a/fluss-rust/crates/fluss/src/metadata/table.rs b/fluss-rust/crates/fluss/src/metadata/table.rs index 751dd6da02..770c4f2cfe 100644 --- a/fluss-rust/crates/fluss/src/metadata/table.rs +++ b/fluss-rust/crates/fluss/src/metadata/table.rs @@ -16,7 +16,7 @@ // under the License. 
use crate::error::Error::InvalidTableError; -use crate::error::Result; +use crate::error::{Error, Result}; use crate::metadata::datatype::{DataField, DataType, RowType}; use core::fmt; use serde::{Deserialize, Serialize}; @@ -220,9 +220,9 @@ impl SchemaBuilder { ) -> Result> { let names: Vec<_> = columns.iter().map(|c| &c.name).collect(); if let Some(duplicates) = Self::find_duplicates(&names) { - return Err(InvalidTableError(format!( - "Duplicate column names found: {duplicates:?}" - ))); + return Err(InvalidTableError { + message: format!("Duplicate column names found: {duplicates:?}"), + }); } let Some(pk) = primary_key else { @@ -232,9 +232,9 @@ impl SchemaBuilder { let pk_set: HashSet<_> = pk.column_names.iter().collect(); let all_columns: HashSet<_> = columns.iter().map(|c| &c.name).collect(); if !pk_set.is_subset(&all_columns) { - return Err(InvalidTableError(format!( - "Primary key columns {pk_set:?} not found in schema" - ))); + return Err(InvalidTableError { + message: format!("Primary key columns {pk_set:?} not found in schema"), + }); } Ok(columns @@ -441,12 +441,12 @@ impl TableDescriptor { pub fn replication_factor(&self) -> Result { self.properties .get("table.replication.factor") - .ok_or(InvalidTableError( - "Replication factor is not set".to_string(), - ))? + .ok_or_else(|| InvalidTableError { + message: "Replication factor is not set".to_string(), + })? 
.parse() - .map_err(|_e| { - InvalidTableError("Replication factor can't be convert into int".to_string()) + .map_err(|_e| InvalidTableError { + message: "Replication factor can't be convert into int".to_string(), }) } @@ -497,11 +497,13 @@ impl TableDescriptor { bucket_keys.retain(|k| !partition_keys.contains(k)); if bucket_keys.is_empty() { - return Err(InvalidTableError(format!( - "Primary Key constraint {:?} should not be same with partition fields {:?}.", - schema.primary_key().unwrap().column_names(), - partition_keys - ))); + return Err(Error::InvalidTableError { + message: format!( + "Primary Key constraint {:?} should not be same with partition fields {:?}.", + schema.primary_key().unwrap().column_names(), + partition_keys + ), + }); } Ok(bucket_keys) @@ -518,10 +520,12 @@ impl TableDescriptor { .iter() .any(|k| partition_keys.contains(k)) { - return Err(InvalidTableError(format!( - "Bucket key {:?} shouldn't include any column in partition keys {:?}.", - distribution.bucket_keys, partition_keys - ))); + return Err(InvalidTableError { + message: format!( + "Bucket key {:?} shouldn't include any column in partition keys {:?}.", + distribution.bucket_keys, partition_keys + ), + }); } return if let Some(pk) = schema.primary_key() { @@ -540,13 +544,15 @@ impl TableDescriptor { .iter() .all(|k| pk_columns.contains(k)) { - return Err(InvalidTableError(format!( - "Bucket keys must be a subset of primary keys excluding partition keys for primary-key tables. \ - The primary keys are {:?}, the partition keys are {:?}, but the user-defined bucket keys are {:?}.", - pk.column_names(), - partition_keys, - distribution.bucket_keys - ))); + return Err(InvalidTableError { + message: format!( + "Bucket keys must be a subset of primary keys excluding partition keys for primary-key tables. 
\ + The primary keys are {:?}, the partition keys are {:?}, but the user-defined bucket keys are {:?}.", + pk.column_names(), + partition_keys, + distribution.bucket_keys + ), + }); } Ok(Some(distribution)) } @@ -589,7 +595,9 @@ impl LogFormat { match s.to_uppercase().as_str() { "ARROW" => Ok(LogFormat::ARROW), "INDEXED" => Ok(LogFormat::INDEXED), - _ => Err(InvalidTableError(format!("Unknown log format: {s}"))), + _ => Err(InvalidTableError { + message: format!("Unknown log format: {s}"), + }), } } } @@ -615,7 +623,9 @@ impl KvFormat { match s.to_uppercase().as_str() { "INDEXED" => Ok(KvFormat::INDEXED), "COMPACTED" => Ok(KvFormat::COMPACTED), - _ => Err(InvalidTableError(format!("Unknown kv format: {s}"))), + _ => Err(Error::InvalidTableError { + message: format!("Unknown kv format: {s}"), + }), } } } @@ -961,6 +971,24 @@ impl TableBucket { } } +impl Display for TableBucket { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(partition_id) = self.partition_id { + write!( + f, + "TableBucket(table_id={}, partition_id={}, bucket={})", + self.table_id, partition_id, self.bucket + ) + } else { + write!( + f, + "TableBucket(table_id={}, bucket={})", + self.table_id, self.bucket + ) + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct LakeSnapshot { pub snapshot_id: i64, diff --git a/fluss-rust/crates/fluss/src/proto/fluss_api.proto b/fluss-rust/crates/fluss/src/proto/fluss_api.proto index e59c2d9dd6..dbbb45daea 100644 --- a/fluss-rust/crates/fluss/src/proto/fluss_api.proto +++ b/fluss-rust/crates/fluss/src/proto/fluss_api.proto @@ -19,6 +19,11 @@ syntax = "proto2"; package proto; +message ErrorResponse { + required int32 error_code = 1; + optional string error_message = 2; +} + // metadata request and response, request send from client to each server. 
message MetadataRequest { repeated PbTablePath table_path = 1; diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index 6e8cb55962..9295713cc2 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -844,10 +844,7 @@ impl ReadContext { } pub fn record_batch_for_remote_log(&self, data: &[u8]) -> Result> { - let (batch_metadata, body_buffer, version) = match parse_ipc_message(data) { - Some(result) => result, - None => return Ok(None), - }; + let (batch_metadata, body_buffer, version) = parse_ipc_message(data)?; let record_batch = read_record_batch( &body_buffer, @@ -1086,13 +1083,17 @@ mod tests { let result = parse_ipc_message(empty_body); assert_eq!( result.unwrap_err().to_string(), - String::from("Arrow error: Parser error: Range [0, 4) is out of bounds.\n\n") + String::from( + "Fluss hitting Arrow error Parser error: Range [0, 4) is out of bounds.\n\n: ParseError(\"Range [0, 4) is out of bounds.\\n\\n\")." + ) ); let invalid_data = &[]; assert_eq!( parse_ipc_message(invalid_data).unwrap_err().to_string(), - String::from("Arrow error: Parser error: Invalid data length: 0") + String::from( + "Fluss hitting Arrow error Parser error: Invalid data length: 0: ParseError(\"Invalid data length: 0\")." + ) ); let data_with_invalid_continuation: &[u8] = &le_bytes(&[0x00000001, 0x00000000]); @@ -1100,7 +1101,9 @@ mod tests { parse_ipc_message(data_with_invalid_continuation) .unwrap_err() .to_string(), - String::from("Arrow error: Parser error: Invalid continuation marker: 1") + String::from( + "Fluss hitting Arrow error Parser error: Invalid continuation marker: 1: ParseError(\"Invalid continuation marker: 1\")." + ) ); let data_with_invalid_length: &[u8] = &le_bytes(&[0xFFFFFFFF, 0x00000001]); @@ -1109,8 +1112,7 @@ mod tests { .unwrap_err() .to_string(), String::from( - "Arrow error: Parser error: Invalid data length. 
\ - Remaining data length 0 is shorter than specified size 1" + "Fluss hitting Arrow error Parser error: Invalid data length. Remaining data length 0 is shorter than specified size 1: ParseError(\"Invalid data length. Remaining data length 0 is shorter than specified size 1\")." ) ); @@ -1119,7 +1121,9 @@ mod tests { parse_ipc_message(data_with_invalid_length) .unwrap_err() .to_string(), - String::from("Arrow error: Parser error: Not a record batch") + String::from( + "Fluss hitting Arrow error Parser error: Not a record batch: ParseError(\"Not a record batch\")." + ) ); } diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs index 6929b57c5c..1ea393349e 100644 --- a/fluss-rust/crates/fluss/src/row/datum.rs +++ b/fluss-rust/crates/fluss/src/row/datum.rs @@ -290,18 +290,22 @@ impl Datum<'_> { Datum::String(v) => append_value_to_arrow!(StringBuilder, *v), Datum::Blob(v) => append_value_to_arrow!(BinaryBuilder, v.as_ref()), Datum::Decimal(_) | Datum::Date(_) | Datum::Timestamp(_) | Datum::TimestampTz(_) => { - return Err(RowConvertError(format!( - "Type {:?} is not yet supported for Arrow conversion", - std::mem::discriminant(self) - ))); + return Err(RowConvertError { + message: format!( + "Type {:?} is not yet supported for Arrow conversion", + std::mem::discriminant(self) + ), + }); } } - Err(RowConvertError(format!( - "Cannot append {:?} to builder of type {}", - self, - std::any::type_name_of_val(builder) - ))) + Err(RowConvertError { + message: format!( + "Cannot append {:?} to builder of type {}", + self, + std::any::type_name_of_val(builder) + ), + }) } } @@ -313,11 +317,13 @@ macro_rules! 
impl_to_arrow { b.append_value(*self); Ok(()) } else { - Err(RowConvertError(format!( - "Cannot cast {} to {} builder", - stringify!($ty), - stringify!($variant) - ))) + Err(RowConvertError { + message: format!( + "Cannot cast {} to {} builder", + stringify!($ty), + stringify!($variant) + ), + }) } } } diff --git a/fluss-rust/crates/fluss/src/rpc/error.rs b/fluss-rust/crates/fluss/src/rpc/error.rs index 84b20b102e..da3a11e295 100644 --- a/fluss-rust/crates/fluss/src/rpc/error.rs +++ b/fluss-rust/crates/fluss/src/rpc/error.rs @@ -17,6 +17,7 @@ use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; +use prost::DecodeError; use std::sync::Arc; use thiserror::Error; @@ -29,6 +30,9 @@ pub enum RpcError { #[error("Cannot read framed message: {0}")] ReadMessageError(#[from] crate::rpc::frame::ReadError), + #[error("Rpc Decode Error: {0}")] + RpcDecodeError(#[from] DecodeError), + #[error("connection error")] ConnectionError(String), diff --git a/fluss-rust/crates/fluss/src/rpc/fluss_api_error.rs b/fluss-rust/crates/fluss/src/rpc/fluss_api_error.rs new file mode 100644 index 0000000000..b26eb72f61 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/fluss_api_error.rs @@ -0,0 +1,371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::proto::ErrorResponse; +use std::fmt::{Debug, Display, Formatter}; + +/// API error response from Fluss server +pub struct ApiError { + pub code: i32, + pub message: String, +} + +impl Debug for ApiError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ApiError") + .field("code", &self.code) + .field("message", &self.message) + .finish() + } +} + +impl Display for ApiError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + Debug::fmt(self, f) + } +} + +/// Fluss protocol errors. These errors are part of the client-server protocol. +/// The error codes cannot be changed, but the names can be. +/// +/// Do not add exceptions that occur only on the client or only on the server here. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[repr(i32)] +pub enum FlussError { + /// The server experienced an unexpected error when processing the request. + UnknownServerError = -1, + /// No error occurred. + None = 0, + /// The server disconnected before a response was received. + NetworkException = 1, + /// The version of API is not supported. + UnsupportedVersion = 2, + /// This message has failed its CRC checksum, exceeds the valid size, has a null key for a primary key table, or is otherwise corrupt. + CorruptMessage = 3, + /// The database does not exist. + DatabaseNotExist = 4, + /// The database is not empty. + DatabaseNotEmpty = 5, + /// The database already exists. + DatabaseAlreadyExist = 6, + /// The table does not exist. + TableNotExist = 7, + /// The table already exists. + TableAlreadyExist = 8, + /// The schema does not exist. + SchemaNotExist = 9, + /// Exception occur while storage data for log in server. + LogStorageException = 10, + /// Exception occur while storage data for kv in server. + KvStorageException = 11, + /// Not leader or follower. 
+ NotLeaderOrFollower = 12, + /// The record is too large. + RecordTooLargeException = 13, + /// The record is corrupt. + CorruptRecordException = 14, + /// The client has attempted to perform an operation on an invalid table. + InvalidTableException = 15, + /// The client has attempted to perform an operation on an invalid database. + InvalidDatabaseException = 16, + /// The replication factor is larger then the number of available tablet servers. + InvalidReplicationFactor = 17, + /// Produce request specified an invalid value for required acks. + InvalidRequiredAcks = 18, + /// The log offset is out of range. + LogOffsetOutOfRangeException = 19, + /// The table is not primary key table. + NonPrimaryKeyTableException = 20, + /// The table or bucket does not exist. + UnknownTableOrBucketException = 21, + /// The update version is invalid. + InvalidUpdateVersionException = 22, + /// The coordinator is invalid. + InvalidCoordinatorException = 23, + /// The leader epoch is invalid. + FencedLeaderEpochException = 24, + /// The request time out. + RequestTimeOut = 25, + /// The general storage exception. + StorageException = 26, + /// The server did not attempt to execute this operation. + OperationNotAttemptedException = 27, + /// Records are written to the server already, but to fewer in-sync replicas than required. + NotEnoughReplicasAfterAppendException = 28, + /// Messages are rejected since there are fewer in-sync replicas than required. + NotEnoughReplicasException = 29, + /// Get file access security token exception. + SecurityTokenException = 30, + /// The tablet server received an out of order sequence batch. + OutOfOrderSequenceException = 31, + /// The tablet server received a duplicate sequence batch. + DuplicateSequenceException = 32, + /// This exception is raised by the tablet server if it could not locate the writer metadata. + UnknownWriterIdException = 33, + /// The requested column projection is invalid. 
+ InvalidColumnProjection = 34, + /// The requested target column to write is invalid. + InvalidTargetColumn = 35, + /// The partition does not exist. + PartitionNotExists = 36, + /// The table is not partitioned. + TableNotPartitionedException = 37, + /// The timestamp is invalid. + InvalidTimestampException = 38, + /// The config is invalid. + InvalidConfigException = 39, + /// The lake storage is not configured. + LakeStorageNotConfiguredException = 40, + /// The kv snapshot is not exist. + KvSnapshotNotExist = 41, + /// The partition already exists. + PartitionAlreadyExists = 42, + /// The partition spec is invalid. + PartitionSpecInvalidException = 43, + /// There is no currently available leader for the given partition. + LeaderNotAvailableException = 44, + /// Exceed the maximum number of partitions. + PartitionMaxNumException = 45, + /// Authentication failed. + AuthenticateException = 46, + /// Security is disabled. + SecurityDisabledException = 47, + /// Authorization failed. + AuthorizationException = 48, + /// Exceed the maximum number of buckets. + BucketMaxNumException = 49, + /// The tiering epoch is invalid. + FencedTieringEpochException = 50, + /// Authentication failed with retriable exception. + RetriableAuthenticateException = 51, + /// The server rack info is invalid. + InvalidServerRackInfoException = 52, + /// The lake snapshot is not exist. + LakeSnapshotNotExist = 53, + /// The lake table already exists. + LakeTableAlreadyExist = 54, + /// The new ISR contains at least one ineligible replica. + IneligibleReplicaException = 55, + /// The alter table is invalid. + InvalidAlterTableException = 56, + /// Deletion operations are disabled on this table. + DeletionDisabledException = 57, +} + +impl FlussError { + /// Returns the error code for this error. + pub fn code(&self) -> i32 { + *self as i32 + } + + /// Returns a friendly description of the error. 
+ pub fn message(&self) -> &'static str { + match self { + FlussError::UnknownServerError => { + "The server experienced an unexpected error when processing the request." + } + FlussError::None => "No error", + FlussError::NetworkException => { + "The server disconnected before a response was received." + } + FlussError::UnsupportedVersion => "The version of API is not supported.", + FlussError::CorruptMessage => { + "This message has failed its CRC checksum, exceeds the valid size, has a null key for a primary key table, or is otherwise corrupt." + } + FlussError::DatabaseNotExist => "The database does not exist.", + FlussError::DatabaseNotEmpty => "The database is not empty.", + FlussError::DatabaseAlreadyExist => "The database already exists.", + FlussError::TableNotExist => "The table does not exist.", + FlussError::TableAlreadyExist => "The table already exists.", + FlussError::SchemaNotExist => "The schema does not exist.", + FlussError::LogStorageException => { + "Exception occur while storage data for log in server." + } + FlussError::KvStorageException => { + "Exception occur while storage data for kv in server." + } + FlussError::NotLeaderOrFollower => "Not leader or follower.", + FlussError::RecordTooLargeException => "The record is too large.", + FlussError::CorruptRecordException => "The record is corrupt.", + FlussError::InvalidTableException => { + "The client has attempted to perform an operation on an invalid table." + } + FlussError::InvalidDatabaseException => { + "The client has attempted to perform an operation on an invalid database." + } + FlussError::InvalidReplicationFactor => { + "The replication factor is larger then the number of available tablet servers." + } + FlussError::InvalidRequiredAcks => { + "Produce request specified an invalid value for required acks." 
+ } + FlussError::LogOffsetOutOfRangeException => "The log offset is out of range.", + FlussError::NonPrimaryKeyTableException => "The table is not primary key table.", + FlussError::UnknownTableOrBucketException => "The table or bucket does not exist.", + FlussError::InvalidUpdateVersionException => "The update version is invalid.", + FlussError::InvalidCoordinatorException => "The coordinator is invalid.", + FlussError::FencedLeaderEpochException => "The leader epoch is invalid.", + FlussError::RequestTimeOut => "The request time out.", + FlussError::StorageException => "The general storage exception.", + FlussError::OperationNotAttemptedException => { + "The server did not attempt to execute this operation." + } + FlussError::NotEnoughReplicasAfterAppendException => { + "Records are written to the server already, but to fewer in-sync replicas than required." + } + FlussError::NotEnoughReplicasException => { + "Messages are rejected since there are fewer in-sync replicas than required." + } + FlussError::SecurityTokenException => "Get file access security token exception.", + FlussError::OutOfOrderSequenceException => { + "The tablet server received an out of order sequence batch." + } + FlussError::DuplicateSequenceException => { + "The tablet server received a duplicate sequence batch." + } + FlussError::UnknownWriterIdException => { + "This exception is raised by the tablet server if it could not locate the writer metadata." 
+ } + FlussError::InvalidColumnProjection => "The requested column projection is invalid.", + FlussError::InvalidTargetColumn => "The requested target column to write is invalid.", + FlussError::PartitionNotExists => "The partition does not exist.", + FlussError::TableNotPartitionedException => "The table is not partitioned.", + FlussError::InvalidTimestampException => "The timestamp is invalid.", + FlussError::InvalidConfigException => "The config is invalid.", + FlussError::LakeStorageNotConfiguredException => "The lake storage is not configured.", + FlussError::KvSnapshotNotExist => "The kv snapshot does not exist.", + FlussError::PartitionAlreadyExists => "The partition already exists.", + FlussError::PartitionSpecInvalidException => "The partition spec is invalid.", + FlussError::LeaderNotAvailableException => { + "There is no currently available leader for the given partition." + } + FlussError::PartitionMaxNumException => "Exceed the maximum number of partitions.", + FlussError::AuthenticateException => "Authentication failed.", + FlussError::SecurityDisabledException => "Security is disabled.", + FlussError::AuthorizationException => "Authorization failed.", + FlussError::BucketMaxNumException => "Exceed the maximum number of buckets.", + FlussError::FencedTieringEpochException => "The tiering epoch is invalid.", + FlussError::RetriableAuthenticateException => { + "Authentication failed with retriable exception." + } + FlussError::InvalidServerRackInfoException => "The server rack info is invalid.", + FlussError::LakeSnapshotNotExist => "The lake snapshot does not exist.", + FlussError::LakeTableAlreadyExist => "The lake table already exists.", + FlussError::IneligibleReplicaException => { + "The new ISR contains at least one ineligible replica." + } + FlussError::InvalidAlterTableException => "The alter table is invalid.", + FlussError::DeletionDisabledException => { + "Deletion operations are disabled on this table." 
+ } + } + } + + /// Create an ApiError from this error with the default message. + pub fn to_api_error(&self, message: Option) -> ApiError { + ApiError { + code: self.code(), + message: message.unwrap_or(self.message().to_string()), + } + } + + /// Get the FlussError for the given error code. + /// Returns `UnknownServerError` if the code is not recognized. + pub fn for_code(code: i32) -> Self { + match code { + -1 => FlussError::UnknownServerError, + 0 => FlussError::None, + 1 => FlussError::NetworkException, + 2 => FlussError::UnsupportedVersion, + 3 => FlussError::CorruptMessage, + 4 => FlussError::DatabaseNotExist, + 5 => FlussError::DatabaseNotEmpty, + 6 => FlussError::DatabaseAlreadyExist, + 7 => FlussError::TableNotExist, + 8 => FlussError::TableAlreadyExist, + 9 => FlussError::SchemaNotExist, + 10 => FlussError::LogStorageException, + 11 => FlussError::KvStorageException, + 12 => FlussError::NotLeaderOrFollower, + 13 => FlussError::RecordTooLargeException, + 14 => FlussError::CorruptRecordException, + 15 => FlussError::InvalidTableException, + 16 => FlussError::InvalidDatabaseException, + 17 => FlussError::InvalidReplicationFactor, + 18 => FlussError::InvalidRequiredAcks, + 19 => FlussError::LogOffsetOutOfRangeException, + 20 => FlussError::NonPrimaryKeyTableException, + 21 => FlussError::UnknownTableOrBucketException, + 22 => FlussError::InvalidUpdateVersionException, + 23 => FlussError::InvalidCoordinatorException, + 24 => FlussError::FencedLeaderEpochException, + 25 => FlussError::RequestTimeOut, + 26 => FlussError::StorageException, + 27 => FlussError::OperationNotAttemptedException, + 28 => FlussError::NotEnoughReplicasAfterAppendException, + 29 => FlussError::NotEnoughReplicasException, + 30 => FlussError::SecurityTokenException, + 31 => FlussError::OutOfOrderSequenceException, + 32 => FlussError::DuplicateSequenceException, + 33 => FlussError::UnknownWriterIdException, + 34 => FlussError::InvalidColumnProjection, + 35 => 
FlussError::InvalidTargetColumn, + 36 => FlussError::PartitionNotExists, + 37 => FlussError::TableNotPartitionedException, + 38 => FlussError::InvalidTimestampException, + 39 => FlussError::InvalidConfigException, + 40 => FlussError::LakeStorageNotConfiguredException, + 41 => FlussError::KvSnapshotNotExist, + 42 => FlussError::PartitionAlreadyExists, + 43 => FlussError::PartitionSpecInvalidException, + 44 => FlussError::LeaderNotAvailableException, + 45 => FlussError::PartitionMaxNumException, + 46 => FlussError::AuthenticateException, + 47 => FlussError::SecurityDisabledException, + 48 => FlussError::AuthorizationException, + 49 => FlussError::BucketMaxNumException, + 50 => FlussError::FencedTieringEpochException, + 51 => FlussError::RetriableAuthenticateException, + 52 => FlussError::InvalidServerRackInfoException, + 53 => FlussError::LakeSnapshotNotExist, + 54 => FlussError::LakeTableAlreadyExist, + 55 => FlussError::IneligibleReplicaException, + 56 => FlussError::InvalidAlterTableException, + 57 => FlussError::DeletionDisabledException, + _ => FlussError::UnknownServerError, + } + } +} + +impl Display for FlussError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.message()) + } +} + +impl From for ApiError { + fn from(error_response: ErrorResponse) -> Self { + let fluss_error = FlussError::for_code(error_response.error_code); + fluss_error.to_api_error(error_response.error_message) + } +} + +impl From for FlussError { + fn from(api_error: ApiError) -> Self { + FlussError::for_code(api_error.code) + } +} diff --git a/fluss-rust/crates/fluss/src/rpc/frame.rs b/fluss-rust/crates/fluss/src/rpc/frame.rs index 44dadc9408..81cc0946d7 100644 --- a/fluss-rust/crates/fluss/src/rpc/frame.rs +++ b/fluss-rust/crates/fluss/src/rpc/frame.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use prost::DecodeError; use thiserror::Error; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; @@ -29,6 +30,9 @@ pub enum ReadError { #[error("Message too large, limit is {limit} bytes but got {actual} bytes")] MessageTooLarge { limit: usize, actual: usize }, + + #[error("Fail to decode error response: {0}")] + ProtoErrorResponseDecodeError(#[from] DecodeError), } pub trait AsyncMessageRead { diff --git a/fluss-rust/crates/fluss/src/rpc/message/create_database.rs b/fluss-rust/crates/fluss/src/rpc/message/create_database.rs index e4052ef361..7d24235a57 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/create_database.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/create_database.rs @@ -22,7 +22,8 @@ use crate::error::Result as FlussResult; use crate::proto::CreateDatabaseResponse; use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::ReadError; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use bytes::{Buf, BufMut}; diff --git a/fluss-rust/crates/fluss/src/rpc/message/create_table.rs b/fluss-rust/crates/fluss/src/rpc/message/create_table.rs index 5802e71797..69865b8902 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/create_table.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/create_table.rs @@ -23,7 +23,8 @@ use crate::proto::CreateTableResponse; use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; use crate::rpc::convert::to_table_path; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::ReadError; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use bytes::{Buf, BufMut}; diff --git a/fluss-rust/crates/fluss/src/rpc/message/database_exists.rs b/fluss-rust/crates/fluss/src/rpc/message/database_exists.rs index 795eea1260..7e717a4e7a 100644 --- 
a/fluss-rust/crates/fluss/src/rpc/message/database_exists.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/database_exists.rs @@ -15,9 +15,11 @@ // specific language governing permissions and limitations // under the License. +use crate::rpc::frame::ReadError; + use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use crate::{impl_read_version_type, impl_write_version_type, proto}; use bytes::{Buf, BufMut}; diff --git a/fluss-rust/crates/fluss/src/rpc/message/drop_database.rs b/fluss-rust/crates/fluss/src/rpc/message/drop_database.rs index 49cbfaf8d4..663e970a91 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/drop_database.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/drop_database.rs @@ -15,9 +15,11 @@ // specific language governing permissions and limitations // under the License. +use crate::rpc::frame::ReadError; + use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use crate::{impl_read_version_type, impl_write_version_type, proto}; use bytes::{Buf, BufMut}; diff --git a/fluss-rust/crates/fluss/src/rpc/message/drop_table.rs b/fluss-rust/crates/fluss/src/rpc/message/drop_table.rs index 0dbc21bbbe..a2b3f2d129 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/drop_table.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/drop_table.rs @@ -19,10 +19,12 @@ use crate::metadata::TablePath; use crate::{impl_read_version_type, impl_write_version_type, proto}; use crate::proto::DropTableResponse; +use crate::rpc::frame::ReadError; + use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; use crate::rpc::convert::to_table_path; -use crate::rpc::frame::{ReadError, WriteError}; +use 
crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use bytes::{Buf, BufMut}; diff --git a/fluss-rust/crates/fluss/src/rpc/message/fetch.rs b/fluss-rust/crates/fluss/src/rpc/message/fetch.rs index 6ebc5a2b33..15876069d7 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/fetch.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/fetch.rs @@ -16,9 +16,11 @@ // under the License. use crate::proto::FetchLogResponse; +use crate::rpc::frame::ReadError; + use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use crate::{impl_read_version_type, impl_write_version_type, proto}; use prost::Message; diff --git a/fluss-rust/crates/fluss/src/rpc/message/get_database_info.rs b/fluss-rust/crates/fluss/src/rpc/message/get_database_info.rs index 85492a8bf0..6468bebd5f 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/get_database_info.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/get_database_info.rs @@ -15,9 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+use crate::rpc::frame::ReadError; + use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use crate::{impl_read_version_type, impl_write_version_type, proto}; use bytes::{Buf, BufMut}; diff --git a/fluss-rust/crates/fluss/src/rpc/message/get_latest_lake_snapshot.rs b/fluss-rust/crates/fluss/src/rpc/message/get_latest_lake_snapshot.rs index a0e186efd2..a632a1596d 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/get_latest_lake_snapshot.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/get_latest_lake_snapshot.rs @@ -19,10 +19,12 @@ use crate::proto; use crate::proto::PbTablePath; use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use crate::metadata::TablePath; +use crate::rpc::frame::ReadError; + use crate::{impl_read_version_type, impl_write_version_type}; use bytes::{Buf, BufMut}; use prost::Message; diff --git a/fluss-rust/crates/fluss/src/rpc/message/get_table.rs b/fluss-rust/crates/fluss/src/rpc/message/get_table.rs index 4f4d6c7a41..61657f7a7c 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/get_table.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/get_table.rs @@ -18,10 +18,12 @@ use crate::proto::{GetTableInfoRequest, GetTableInfoResponse, PbTablePath}; use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use crate::metadata::TablePath; +use crate::rpc::frame::ReadError; + use crate::{impl_read_version_type, impl_write_version_type}; use bytes::{Buf, BufMut}; use prost::Message; diff --git 
a/fluss-rust/crates/fluss/src/rpc/message/header.rs b/fluss-rust/crates/fluss/src/rpc/message/header.rs index fe60f8c997..77bda7c78c 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/header.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/header.rs @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. +use crate::proto::ErrorResponse; use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; use crate::rpc::frame::{ReadError, WriteError}; use crate::rpc::message::{ReadVersionedType, WriteVersionedType}; use bytes::{Buf, BufMut}; +use prost::Message; #[allow(dead_code)] const REQUEST_HEADER_LENGTH: i32 = 8; @@ -53,9 +55,10 @@ where } } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq)] pub struct ResponseHeader { pub request_id: i32, + pub error_response: Option, } impl ReadVersionedType for ResponseHeader @@ -64,10 +67,17 @@ where { fn read_versioned(reader: &mut R, _version: ApiVersion) -> Result { let resp_type = reader.get_u8(); + let request_id = reader.get_i32(); if resp_type != SUCCESS_RESPONSE { - todo!("handle unsuccess response type"); + let error_response = ErrorResponse::decode(reader)?; + return Ok(ResponseHeader { + request_id, + error_response: Some(error_response), + }); } - let request_id = reader.get_i32(); - Ok(ResponseHeader { request_id }) + Ok(ResponseHeader { + request_id, + error_response: None, + }) } } diff --git a/fluss-rust/crates/fluss/src/rpc/message/list_databases.rs b/fluss-rust/crates/fluss/src/rpc/message/list_databases.rs index ce5a091540..83226ab1a9 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/list_databases.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/list_databases.rs @@ -15,9 +15,11 @@ // specific language governing permissions and limitations // under the License. 
+use crate::rpc::frame::ReadError; + use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use crate::{impl_read_version_type, impl_write_version_type, proto}; use bytes::{Buf, BufMut}; diff --git a/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs b/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs index 500db33e8c..9ab1f143f3 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs @@ -20,9 +20,11 @@ use crate::{impl_read_version_type, impl_write_version_type, proto}; use crate::error::Error; use crate::error::Result as FlussResult; use crate::proto::ListOffsetsResponse; +use crate::rpc::frame::ReadError; + use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use std::collections::HashMap; @@ -108,12 +110,15 @@ impl ListOffsetsResponse { .map(|resp| { if resp.error_code.is_some() { // todo: consider use another suitable error - Err(Error::WriteError(format!( - "Missing offset, error message: {}", - resp.error_message - .as_deref() - .unwrap_or("unknown server exception") - ))) + Err(Error::UnexpectedError { + message: format!( + "Missing offset, error message: {}", + resp.error_message + .as_deref() + .unwrap_or("unknown server exception") + ), + source: None, + }) } else { // if no error msg, offset must exists Ok((resp.bucket_id, resp.offset.unwrap())) diff --git a/fluss-rust/crates/fluss/src/rpc/message/list_tables.rs b/fluss-rust/crates/fluss/src/rpc/message/list_tables.rs index daf57ea6b5..ff2497a063 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/list_tables.rs +++ 
b/fluss-rust/crates/fluss/src/rpc/message/list_tables.rs @@ -18,9 +18,11 @@ use crate::{impl_read_version_type, impl_write_version_type, proto}; use crate::proto::ListTablesResponse; +use crate::rpc::frame::ReadError; + use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use bytes::{Buf, BufMut}; diff --git a/fluss-rust/crates/fluss/src/rpc/message/mod.rs b/fluss-rust/crates/fluss/src/rpc/message/mod.rs index 0ed5b7c0e6..b619ee4023 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/mod.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/mod.rs @@ -38,6 +38,7 @@ mod produce_log; mod table_exists; mod update_metadata; +pub use crate::rpc::RpcError; pub use create_database::*; pub use create_table::*; pub use database_exists::*; diff --git a/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs b/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs index 7da2b59a2d..39bfb3f205 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs @@ -17,9 +17,11 @@ use crate::error::Result as FlussResult; use crate::proto::{PbProduceLogReqForBucket, ProduceLogResponse}; +use crate::rpc::frame::ReadError; + use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use crate::{impl_read_version_type, impl_write_version_type, proto}; use std::sync::Arc; diff --git a/fluss-rust/crates/fluss/src/rpc/message/table_exists.rs b/fluss-rust/crates/fluss/src/rpc/message/table_exists.rs index 3b71f471ac..ec982116b4 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/table_exists.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/table_exists.rs @@ -22,12 +22,13 
@@ use crate::proto::TableExistsResponse; use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; use crate::rpc::convert::to_table_path; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; +use crate::rpc::frame::ReadError; + use bytes::{Buf, BufMut}; use prost::Message; - #[derive(Debug)] pub struct TableExistsRequest { pub inner_request: proto::TableExistsRequest, diff --git a/fluss-rust/crates/fluss/src/rpc/message/update_metadata.rs b/fluss-rust/crates/fluss/src/rpc/message/update_metadata.rs index 0d8ad6464a..a6e6288609 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/update_metadata.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/update_metadata.rs @@ -18,10 +18,12 @@ use crate::proto::{MetadataResponse, PbTablePath}; use crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; -use crate::rpc::frame::{ReadError, WriteError}; +use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use crate::metadata::TablePath; +use crate::rpc::frame::ReadError; + use crate::{impl_read_version_type, impl_write_version_type, proto}; use bytes::{Buf, BufMut}; use prost::Message; diff --git a/fluss-rust/crates/fluss/src/rpc/mod.rs b/fluss-rust/crates/fluss/src/rpc/mod.rs index b8705a3f65..86e13b1c4c 100644 --- a/fluss-rust/crates/fluss/src/rpc/mod.rs +++ b/fluss-rust/crates/fluss/src/rpc/mod.rs @@ -17,7 +17,9 @@ mod api_key; mod api_version; -mod error; +pub mod error; +mod fluss_api_error; +pub use fluss_api_error::{ApiError, FlussError}; mod frame; pub mod message; pub use error::*; diff --git a/fluss-rust/crates/fluss/src/rpc/server_connection.rs b/fluss-rust/crates/fluss/src/rpc/server_connection.rs index c474534b61..fdeb56fbbe 100644 --- a/fluss-rust/crates/fluss/src/rpc/server_connection.rs +++ b/fluss-rust/crates/fluss/src/rpc/server_connection.rs @@ -16,6 +16,7 @@ 
// under the License. use crate::cluster::ServerNode; +use crate::error::Error; use crate::rpc::api_version::ApiVersion; use crate::rpc::error::RpcError; use crate::rpc::error::RpcError::ConnectionError; @@ -230,7 +231,7 @@ where } } - pub async fn request(&self, msg: R) -> Result + pub async fn request(&self, msg: R) -> Result where R: RequestBody + Send + WriteVersionedType>, R::ResponseBody: ReadVersionedType>>, @@ -249,9 +250,12 @@ where let mut buf = Vec::new(); // write header - header.write_versioned(&mut buf, header_version)?; + header + .write_versioned(&mut buf, header_version) + .map_err(RpcError::WriteMessageError)?; // write message body - msg.write_versioned(&mut buf, body_api_version)?; + msg.write_versioned(&mut buf, body_api_version) + .map_err(RpcError::WriteMessageError)?; let (tx, rx) = channel(); @@ -264,14 +268,21 @@ where ConnectionState::RequestMap(map) => { map.insert(request_id, ActiveRequest { channel: tx }); } - ConnectionState::Poison(e) => return Err(RpcError::Poisoned(Arc::clone(e))), + ConnectionState::Poison(e) => return Err(RpcError::Poisoned(Arc::clone(e)).into()), } self.send_message(buf).await?; _cleanup_on_cancel.message_sent(); let mut response = rx.await.expect("Who closed this channel?!")?; - let body = R::ResponseBody::read_versioned(&mut response.data, body_api_version)?; + if let Some(error_response) = response.header.error_response { + return Err(Error::FlussAPIError { + api_error: crate::rpc::ApiError::from(error_response), + }); + } + + let body = R::ResponseBody::read_versioned(&mut response.data, body_api_version) + .map_err(RpcError::ReadMessageError)?; let read_bytes = response.data.position(); let message_bytes = response.data.into_inner().len() as u64; @@ -281,7 +292,8 @@ where read: read_bytes, api_key: R::API_KEY, api_version: body_api_version, - }); + } + .into()); } Ok(body) } diff --git a/fluss-rust/crates/fluss/tests/integration/admin.rs b/fluss-rust/crates/fluss/tests/integration/admin.rs index 
0086d9c03e..ccb717228e 100644 --- a/fluss-rust/crates/fluss/tests/integration/admin.rs +++ b/fluss-rust/crates/fluss/tests/integration/admin.rs @@ -34,6 +34,7 @@ static SHARED_FLUSS_CLUSTER: LazyLock>>> mod admin_test { use super::SHARED_FLUSS_CLUSTER; use crate::integration::fluss_cluster::{FlussTestingCluster, FlussTestingClusterBuilder}; + use fluss::error::FlussError; use fluss::metadata::{ DataTypes, DatabaseDescriptorBuilder, KvFormat, LogFormat, Schema, TableDescriptor, TablePath, @@ -251,4 +252,35 @@ mod admin_test { // database shouldn't exist now assert_eq!(admin.database_exists(test_db_name).await.unwrap(), false); } + + #[tokio::test] + async fn test_fluss_error_response() { + let cluster = get_fluss_cluster(); + let connection = cluster.get_fluss_connection().await; + let admin = connection + .get_admin() + .await + .expect("Failed to get admin client"); + + let table_path = TablePath::new("fluss".to_string(), "not_exist".to_string()); + + let result = admin.get_table(&table_path).await; + assert!(result.is_err(), "Expected error but got Ok"); + + let error = result.unwrap_err(); + match error { + fluss::error::Error::FlussAPIError { api_error } => { + assert_eq!( + api_error.code, + FlussError::TableNotExist.code(), + "Expected error code 7 (TableNotExist)" + ); + assert_eq!( + api_error.message, "Table 'fluss.not_exist' does not exist.", + "Expected specific error message" + ); + } + other => panic!("Expected FlussAPIError, got {:?}", other), + } + } } diff --git a/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs b/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs index f52d526e37..ca61ff8502 100644 --- a/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs +++ b/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs @@ -38,8 +38,6 @@ mod table_remote_scan_test { use fluss::row::{GenericRow, InternalRow}; use std::collections::HashMap; use std::sync::Arc; - use std::sync::atomic::AtomicUsize; - use 
std::sync::atomic::Ordering; use std::thread; use std::thread::sleep; use std::time::Duration; @@ -89,11 +87,13 @@ mod table_remote_scan_test { temp_dir.to_string_lossy().to_string(), ); - let cluster = - FlussTestingClusterBuilder::new_with_cluster_conf("test_table", &cluster_conf) - .with_remote_data_dir(temp_dir) - .build() - .await; + let cluster = FlussTestingClusterBuilder::new_with_cluster_conf( + "test_table_remote", + &cluster_conf, + ) + .with_remote_data_dir(temp_dir) + .build() + .await; let mut guard = cluster_guard.write(); *guard = Some(cluster); }); From 7772f15e8b1348549b2aa22cd3f63cc4a0e34b6f Mon Sep 17 00:00:00 2001 From: Kelvin Wu Date: Sun, 21 Dec 2025 12:44:26 +0800 Subject: [PATCH 038/287] chore: Update the InternalRow Trait: get_binary, get_bytes to return &[u8] (#104) --- fluss-rust/bindings/cpp/src/types.rs | 4 ++-- fluss-rust/crates/fluss/src/row/column.rs | 6 ++---- fluss-rust/crates/fluss/src/row/mod.rs | 12 ++++++------ 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/fluss-rust/bindings/cpp/src/types.rs b/fluss-rust/bindings/cpp/src/types.rs index f9404ac633..8221f22c9f 100644 --- a/fluss-rust/bindings/cpp/src/types.rs +++ b/fluss-rust/bindings/cpp/src/types.rs @@ -332,12 +332,12 @@ fn core_row_to_ffi_fields(row: &fcore::row::ColumnarRow) -> Vec { ArrowDataType::Binary => { let mut datum = new_datum(DATUM_TYPE_BYTES); // todo: avoid copy bytes for blob - datum.bytes_val = row.get_bytes(i); + datum.bytes_val = row.get_bytes(i).to_vec(); datum } ArrowDataType::FixedSizeBinary(len) => { let mut datum = new_datum(DATUM_TYPE_BYTES); - datum.bytes_val = row.get_binary(i, *len as usize); + datum.bytes_val = row.get_binary(i, *len as usize).to_vec(); datum } ArrowDataType::LargeBinary => { diff --git a/fluss-rust/crates/fluss/src/row/column.rs b/fluss-rust/crates/fluss/src/row/column.rs index 6d47836d9b..20d86c0297 100644 --- a/fluss-rust/crates/fluss/src/row/column.rs +++ b/fluss-rust/crates/fluss/src/row/column.rs @@ -156,23 
+156,21 @@ impl InternalRow for ColumnarRow { .value(self.row_id) } - fn get_binary(&self, pos: usize, _length: usize) -> Vec { + fn get_binary(&self, pos: usize, _length: usize) -> &[u8] { self.record_batch .column(pos) .as_any() .downcast_ref::() .expect("Expected binary array.") .value(self.row_id) - .to_vec() } - fn get_bytes(&self, pos: usize) -> Vec { + fn get_bytes(&self, pos: usize) -> &[u8] { self.record_batch .column(pos) .as_any() .downcast_ref::() .expect("Expected bytes array.") .value(self.row_id) - .to_vec() } } diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index 909f3b136f..dd1dedfeac 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -66,10 +66,10 @@ pub trait InternalRow { // fn get_timestamp_ltz(&self, pos: usize, precision: usize) -> TimestampLtz; /// Returns the binary value at the given position with fixed length - fn get_binary(&self, pos: usize, length: usize) -> Vec; + fn get_binary(&self, pos: usize, length: usize) -> &[u8]; /// Returns the binary value at the given position - fn get_bytes(&self, pos: usize) -> Vec; + fn get_bytes(&self, pos: usize) -> &[u8]; } pub struct GenericRow<'a> { @@ -132,12 +132,12 @@ impl<'a> InternalRow for GenericRow<'a> { self.values.get(pos).unwrap().try_into().unwrap() } - fn get_binary(&self, pos: usize, _length: usize) -> Vec { - self.values.get(pos).unwrap().as_blob().to_vec() + fn get_binary(&self, pos: usize, _length: usize) -> &[u8] { + self.values.get(pos).unwrap().as_blob() } - fn get_bytes(&self, pos: usize) -> Vec { - self.values.get(pos).unwrap().as_blob().to_vec() + fn get_bytes(&self, pos: usize) -> &[u8] { + self.values.get(pos).unwrap().as_blob() } } From f4b0628b21feb290a30b320c5c72d10b0da2a004 Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Sun, 21 Dec 2025 13:06:02 +0800 Subject: [PATCH 039/287] chore: optimize get_char method (#105) --- fluss-rust/crates/fluss/src/row/column.rs | 14 
+++----------- fluss-rust/crates/fluss/src/row/mod.rs | 15 ++++----------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/fluss-rust/crates/fluss/src/row/column.rs b/fluss-rust/crates/fluss/src/row/column.rs index 20d86c0297..31f0fdf298 100644 --- a/fluss-rust/crates/fluss/src/row/column.rs +++ b/fluss-rust/crates/fluss/src/row/column.rs @@ -126,7 +126,7 @@ impl InternalRow for ColumnarRow { .value(self.row_id) } - fn get_char(&self, pos: usize, length: usize) -> String { + fn get_char(&self, pos: usize, _length: usize) -> &str { let array = self .record_batch .column(pos) @@ -135,16 +135,8 @@ impl InternalRow for ColumnarRow { .expect("Expected fixed-size binary array for char type"); let bytes = array.value(self.row_id); - if bytes.len() != length { - panic!( - "Length mismatch for fixed-size char: expected {}, got {}", - length, - bytes.len() - ); - } - - String::from_utf8(bytes.to_vec()) - .unwrap_or_else(|_| String::from_utf8_lossy(bytes).into_owned()) + // don't check length, following java client + std::str::from_utf8(bytes).expect("Invalid UTF-8 in char field") } fn get_string(&self, pos: usize) -> &str { diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index dd1dedfeac..01b89fc9f4 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -51,7 +51,7 @@ pub trait InternalRow { fn get_double(&self, pos: usize) -> f64; /// Returns the string value at the given position with fixed length - fn get_char(&self, pos: usize, length: usize) -> String; + fn get_char(&self, pos: usize, length: usize) -> &str; /// Returns the string value at the given position fn get_string(&self, pos: usize) -> &str; @@ -116,16 +116,9 @@ impl<'a> InternalRow for GenericRow<'a> { self.values.get(pos).unwrap().try_into().unwrap() } - fn get_char(&self, pos: usize, length: usize) -> String { - let value = self.get_string(pos); - if value.len() != length { - panic!( - "Length mismatch for 
fixed-size char: expected {}, got {}", - length, - value.len() - ); - } - value.to_string() + fn get_char(&self, pos: usize, _length: usize) -> &str { + // don't check length, following java client + self.get_string(pos) } fn get_string(&self, pos: usize) -> &str { From f55375905c42bc6e3d9e2a526928a950d0d190b2 Mon Sep 17 00:00:00 2001 From: AlexZhao Date: Sun, 21 Dec 2025 15:19:56 +0800 Subject: [PATCH 040/287] feat: support ListOffset/SubscribeBatch/DropTable for cpp bindings (#100) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --------- Co-authored-by: 赵海源 --- fluss-rust/bindings/cpp/examples/example.cpp | 99 +++++++++++++- fluss-rust/bindings/cpp/include/fluss.hpp | 32 +++++ fluss-rust/bindings/cpp/src/admin.cpp | 42 ++++++ fluss-rust/bindings/cpp/src/ffi_converter.hpp | 1 + fluss-rust/bindings/cpp/src/lib.rs | 122 ++++++++++++++++++ fluss-rust/bindings/cpp/src/table.cpp | 17 +++ fluss-rust/bindings/cpp/src/types.rs | 4 +- fluss-rust/crates/fluss/src/client/admin.rs | 7 + .../crates/fluss/src/client/table/scanner.rs | 22 ++++ 9 files changed, 340 insertions(+), 6 deletions(-) diff --git a/fluss-rust/bindings/cpp/examples/example.cpp b/fluss-rust/bindings/cpp/examples/example.cpp index 5146f28216..04f9ac64f5 100644 --- a/fluss-rust/bindings/cpp/examples/example.cpp +++ b/fluss-rust/bindings/cpp/examples/example.cpp @@ -19,6 +19,8 @@ #include #include +#include +#include static void check(const char* step, const fluss::Result& r) { if (!r.Ok()) { @@ -37,6 +39,17 @@ int main() { fluss::Admin admin; check("get_admin", conn.GetAdmin(admin)); + fluss::TablePath table_path("fluss", "sample_table_cpp_v1"); + + // 2.1) Drop table if exists + std::cout << "Dropping table if exists..." 
<< std::endl; + auto drop_result = admin.DropTable(table_path, true); + if (drop_result.Ok()) { + std::cout << "Table dropped successfully" << std::endl; + } else { + std::cout << "Table drop result: " << drop_result.error_message << std::endl; + } + // 3) Schema & descriptor auto schema = fluss::Schema::NewBuilder() .AddColumn("id", fluss::DataType::Int) @@ -47,14 +60,14 @@ int main() { auto descriptor = fluss::TableDescriptor::NewBuilder() .SetSchema(schema) - .SetBucketCount(1) + .SetBucketCount(3) .SetProperty("table.log.arrow.compression.type", "NONE") - .SetComment("cpp example table") + .SetComment("cpp example table with 3 buckets") .Build(); - fluss::TablePath table_path("fluss", "sample_table_cpp_v1"); - // ignore_if_exists=true to allow re-run - check("create_table", admin.CreateTable(table_path, descriptor, true)); + // 3.1) Create table with 3 buckets + std::cout << "Creating table with 3 buckets..." << std::endl; + check("create_table", admin.CreateTable(table_path, descriptor, false)); // 4) Get table fluss::Table table; @@ -162,5 +175,81 @@ int main() { std::exit(1); } + // 8) List offsets examples + std::cout << "\n=== List Offsets Examples ===" << std::endl; + + // 8.1) Query earliest offsets for all buckets + std::vector all_bucket_ids; + for (int b = 0; b < buckets; ++b) { + all_bucket_ids.push_back(b); + } + + std::unordered_map earliest_offsets; + check("list_earliest_offsets", + admin.ListOffsets(table_path, all_bucket_ids, + fluss::OffsetQuery::Earliest(), + earliest_offsets)); + std::cout << "Earliest offsets:" << std::endl; + for (const auto& [bucket_id, offset] : earliest_offsets) { + std::cout << " Bucket " << bucket_id << ": offset=" << offset << std::endl; + } + + // 8.2) Query latest offsets for all buckets + std::unordered_map latest_offsets; + check("list_latest_offsets", + admin.ListOffsets(table_path, all_bucket_ids, + fluss::OffsetQuery::Latest(), + latest_offsets)); + std::cout << "Latest offsets:" << std::endl; + for (const 
auto& [bucket_id, offset] : latest_offsets) { + std::cout << " Bucket " << bucket_id << ": offset=" << offset << std::endl; + } + + // 8.3) Query offsets for a specific timestamp (current time - 1 hour) + auto now = std::chrono::system_clock::now(); + auto one_hour_ago = now - std::chrono::hours(1); + auto timestamp_ms = std::chrono::duration_cast( + one_hour_ago.time_since_epoch()).count(); + + std::unordered_map timestamp_offsets; + check("list_timestamp_offsets", + admin.ListOffsets(table_path, all_bucket_ids, + fluss::OffsetQuery::FromTimestamp(timestamp_ms), + timestamp_offsets)); + std::cout << "Offsets for timestamp " << timestamp_ms << " (1 hour ago):" << std::endl; + for (const auto& [bucket_id, offset] : timestamp_offsets) { + std::cout << " Bucket " << bucket_id << ": offset=" << offset << std::endl; + } + + // 8.4) Use batch subscribe with offsets from list_offsets + std::cout << "\n=== Batch Subscribe Example ===" << std::endl; + fluss::LogScanner batch_scanner; + check("new_log_scanner_for_batch", table.NewLogScanner(batch_scanner)); + + std::vector subscriptions; + for (const auto& [bucket_id, offset] : earliest_offsets) { + subscriptions.push_back({bucket_id, offset}); + std::cout << "Preparing subscription: bucket=" << bucket_id + << ", offset=" << offset << std::endl; + } + + check("subscribe_batch", batch_scanner.Subscribe(subscriptions)); + std::cout << "Batch subscribed to " << subscriptions.size() << " buckets" << std::endl; + + // 8.5) Poll and verify bucket_id in records + fluss::ScanRecords batch_records; + check("poll_batch", batch_scanner.Poll(5000, batch_records)); + + std::cout << "Scanned " << batch_records.Size() << " records from batch subscription" << std::endl; + for (size_t i = 0; i < batch_records.Size() && i < 5; ++i) { + const auto& rec = batch_records[i]; + std::cout << " Record " << i << ": bucket_id=" << rec.bucket_id + << ", offset=" << rec.offset + << ", timestamp=" << rec.timestamp << std::endl; + } + if 
(batch_records.Size() > 5) { + std::cout << " ... and " << (batch_records.Size() - 5) << " more records" << std::endl; + } + return 0; } diff --git a/fluss-rust/bindings/cpp/include/fluss.hpp b/fluss-rust/bindings/cpp/include/fluss.hpp index 002f80694a..479adf97a9 100644 --- a/fluss-rust/bindings/cpp/include/fluss.hpp +++ b/fluss-rust/bindings/cpp/include/fluss.hpp @@ -63,6 +63,24 @@ enum class DatumType { Bytes = 7, }; +constexpr int64_t EARLIEST_OFFSET = -2; +constexpr int64_t LATEST_OFFSET = -1; + +enum class OffsetSpec { + Earliest = 0, + Latest = 1, + Timestamp = 2, +}; + +struct OffsetQuery { + OffsetSpec spec; + int64_t timestamp{0}; + + static OffsetQuery Earliest() { return {OffsetSpec::Earliest, 0}; } + static OffsetQuery Latest() { return {OffsetSpec::Latest, 0}; } + static OffsetQuery FromTimestamp(int64_t ts) { return {OffsetSpec::Timestamp, ts}; } +}; + struct Result { int32_t error_code{0}; std::string error_message; @@ -301,6 +319,7 @@ struct GenericRow { }; struct ScanRecord { + int32_t bucket_id; int64_t offset; int64_t timestamp; GenericRow row; @@ -324,6 +343,11 @@ struct BucketOffset { int64_t offset; }; +struct BucketSubscription { + int32_t bucket_id; + int64_t offset; +}; + struct LakeSnapshot { int64_t snapshot_id; std::vector bucket_offsets; @@ -372,10 +396,17 @@ class Admin { const TableDescriptor& descriptor, bool ignore_if_exists = false); + Result DropTable(const TablePath& table_path, bool ignore_if_not_exists = false); + Result GetTable(const TablePath& table_path, TableInfo& out); Result GetLatestLakeSnapshot(const TablePath& table_path, LakeSnapshot& out); + Result ListOffsets(const TablePath& table_path, + const std::vector& bucket_ids, + const OffsetQuery& offset_query, + std::unordered_map& out); + private: friend class Connection; Admin(ffi::Admin* admin) noexcept; @@ -448,6 +479,7 @@ class LogScanner { bool Available() const; Result Subscribe(int32_t bucket_id, int64_t start_offset); + Result Subscribe(const std::vector& 
bucket_offsets); Result Poll(int64_t timeout_ms, ScanRecords& out); private: diff --git a/fluss-rust/bindings/cpp/src/admin.cpp b/fluss-rust/bindings/cpp/src/admin.cpp index f6997a640a..bf9c712f4e 100644 --- a/fluss-rust/bindings/cpp/src/admin.cpp +++ b/fluss-rust/bindings/cpp/src/admin.cpp @@ -66,6 +66,16 @@ Result Admin::CreateTable(const TablePath& table_path, return utils::from_ffi_result(ffi_result); } +Result Admin::DropTable(const TablePath& table_path, bool ignore_if_not_exists) { + if (!Available()) { + return utils::make_error(1, "Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + auto ffi_result = admin_->drop_table(ffi_path, ignore_if_not_exists); + return utils::from_ffi_result(ffi_result); +} + Result Admin::GetTable(const TablePath& table_path, TableInfo& out) { if (!Available()) { return utils::make_error(1, "Admin not available"); @@ -98,4 +108,36 @@ Result Admin::GetLatestLakeSnapshot(const TablePath& table_path, LakeSnapshot& o return result; } +Result Admin::ListOffsets(const TablePath& table_path, + const std::vector& bucket_ids, + const OffsetQuery& offset_query, + std::unordered_map& out) { + if (!Available()) { + return utils::make_error(1, "Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + + rust::Vec rust_bucket_ids; + for (int32_t id : bucket_ids) { + rust_bucket_ids.push_back(id); + } + + ffi::FfiOffsetQuery ffi_query; + ffi_query.offset_type = static_cast(offset_query.spec); + ffi_query.timestamp = offset_query.timestamp; + + auto ffi_result = admin_->list_offsets(ffi_path, std::move(rust_bucket_ids), ffi_query); + + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out.clear(); + for (const auto& pair : ffi_result.bucket_offsets) { + out[pair.bucket_id] = pair.offset; + } + } + + return result; +} + } // namespace fluss diff --git a/fluss-rust/bindings/cpp/src/ffi_converter.hpp b/fluss-rust/bindings/cpp/src/ffi_converter.hpp index 
52dd7fe5d4..63a2e91a7b 100644 --- a/fluss-rust/bindings/cpp/src/ffi_converter.hpp +++ b/fluss-rust/bindings/cpp/src/ffi_converter.hpp @@ -222,6 +222,7 @@ inline GenericRow from_ffi_generic_row(const ffi::FfiGenericRow& ffi_row) { inline ScanRecord from_ffi_scan_record(const ffi::FfiScanRecord& ffi_record) { return ScanRecord{ + ffi_record.bucket_id, ffi_record.offset, ffi_record.timestamp, from_ffi_generic_row(ffi_record.row)}; diff --git a/fluss-rust/bindings/cpp/src/lib.rs b/fluss-rust/bindings/cpp/src/lib.rs index 54d6941379..cd1803b888 100644 --- a/fluss-rust/bindings/cpp/src/lib.rs +++ b/fluss-rust/bindings/cpp/src/lib.rs @@ -104,6 +104,7 @@ mod ffi { } struct FfiScanRecord { + bucket_id: i32, offset: i64, timestamp: i64, row: FfiGenericRow, @@ -130,6 +131,26 @@ mod ffi { offset: i64, } + struct FfiOffsetQuery { + offset_type: i32, + timestamp: i64, + } + + struct FfiBucketSubscription { + bucket_id: i32, + offset: i64, + } + + struct FfiBucketOffsetPair { + bucket_id: i32, + offset: i64, + } + + struct FfiListOffsetsResult { + result: FfiResult, + bucket_offsets: Vec, + } + struct FfiLakeSnapshotResult { result: FfiResult, lake_snapshot: FfiLakeSnapshot, @@ -156,11 +177,22 @@ mod ffi { descriptor: &FfiTableDescriptor, ignore_if_exists: bool, ) -> FfiResult; + fn drop_table( + self: &Admin, + table_path: &FfiTablePath, + ignore_if_not_exists: bool, + ) -> FfiResult; fn get_table_info(self: &Admin, table_path: &FfiTablePath) -> FfiTableInfoResult; fn get_latest_lake_snapshot( self: &Admin, table_path: &FfiTablePath, ) -> FfiLakeSnapshotResult; + fn list_offsets( + self: &Admin, + table_path: &FfiTablePath, + bucket_ids: Vec, + offset_query: &FfiOffsetQuery, + ) -> FfiListOffsetsResult; // Table unsafe fn delete_table(table: *mut Table); @@ -182,6 +214,10 @@ mod ffi { // LogScanner unsafe fn delete_log_scanner(scanner: *mut LogScanner); fn subscribe(self: &LogScanner, bucket_id: i32, start_offset: i64) -> FfiResult; + fn subscribe_batch( + self: &LogScanner, + 
subscriptions: Vec, + ) -> FfiResult; fn poll(self: &LogScanner, timeout_ms: i64) -> FfiScanRecordsResult; } } @@ -330,6 +366,25 @@ impl Admin { } } + fn drop_table( + &self, + table_path: &ffi::FfiTablePath, + ignore_if_not_exists: bool, + ) -> ffi::FfiResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let result = + RUNTIME.block_on(async { self.inner.drop_table(&path, ignore_if_not_exists).await }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_result(1, e.to_string()), + } + } + fn get_table_info(&self, table_path: &ffi::FfiTablePath) -> ffi::FfiTableInfoResult { let path = fcore::metadata::TablePath::new( table_path.database_name.clone(), @@ -375,6 +430,58 @@ impl Admin { }, } } + + fn list_offsets( + &self, + table_path: &ffi::FfiTablePath, + bucket_ids: Vec, + offset_query: &ffi::FfiOffsetQuery, + ) -> ffi::FfiListOffsetsResult { + use fcore::rpc::message::OffsetSpec; + + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let offset_spec = match offset_query.offset_type { + 0 => OffsetSpec::Earliest, + 1 => OffsetSpec::Latest, + 2 => OffsetSpec::Timestamp(offset_query.timestamp), + _ => { + return ffi::FfiListOffsetsResult { + result: err_result( + 1, + format!("Invalid offset_type: {}", offset_query.offset_type), + ), + bucket_offsets: vec![], + }; + } + }; + + let result = RUNTIME.block_on(async { + self.inner + .list_offsets(&path, &bucket_ids, offset_spec) + .await + }); + + match result { + Ok(offsets) => { + let bucket_offsets: Vec = offsets + .into_iter() + .map(|(bucket_id, offset)| ffi::FfiBucketOffsetPair { bucket_id, offset }) + .collect(); + ffi::FfiListOffsetsResult { + result: ok_result(), + bucket_offsets, + } + } + Err(e) => ffi::FfiListOffsetsResult { + result: err_result(1, e.to_string()), + bucket_offsets: vec![], + }, + } + } } // Table implementation @@ -511,6 +618,21 @@ 
impl LogScanner { } } + fn subscribe_batch(&self, subscriptions: Vec) -> ffi::FfiResult { + use std::collections::HashMap; + let mut bucket_offsets = HashMap::new(); + for sub in subscriptions { + bucket_offsets.insert(sub.bucket_id, sub.offset); + } + + let result = RUNTIME.block_on(async { self.inner.subscribe_batch(bucket_offsets).await }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_result(1, e.to_string()), + } + } + fn poll(&self, timeout_ms: i64) -> ffi::FfiScanRecordsResult { let timeout = Duration::from_millis(timeout_ms as u64); let result = RUNTIME.block_on(async { self.inner.poll(timeout).await }); diff --git a/fluss-rust/bindings/cpp/src/table.cpp b/fluss-rust/bindings/cpp/src/table.cpp index b28b783ee8..d42e1a2299 100644 --- a/fluss-rust/bindings/cpp/src/table.cpp +++ b/fluss-rust/bindings/cpp/src/table.cpp @@ -210,6 +210,23 @@ Result LogScanner::Subscribe(int32_t bucket_id, int64_t start_offset) { return utils::from_ffi_result(ffi_result); } +Result LogScanner::Subscribe(const std::vector& bucket_offsets) { + if (!Available()) { + return utils::make_error(1, "LogScanner not available"); + } + + rust::Vec rust_subs; + for (const auto& sub : bucket_offsets) { + ffi::FfiBucketSubscription ffi_sub; + ffi_sub.bucket_id = sub.bucket_id; + ffi_sub.offset = sub.offset; + rust_subs.push_back(ffi_sub); + } + + auto ffi_result = scanner_->subscribe_batch(std::move(rust_subs)); + return utils::from_ffi_result(ffi_result); +} + Result LogScanner::Poll(int64_t timeout_ms, ScanRecords& out) { if (!Available()) { return utils::make_error(1, "LogScanner not available"); diff --git a/fluss-rust/bindings/cpp/src/types.rs b/fluss-rust/bindings/cpp/src/types.rs index 8221f22c9f..d95da14212 100644 --- a/fluss-rust/bindings/cpp/src/types.rs +++ b/fluss-rust/bindings/cpp/src/types.rs @@ -233,12 +233,14 @@ pub fn core_scan_records_to_ffi(records: &fcore::record::ScanRecords) -> ffi::Ff let mut ffi_records = Vec::new(); // Iterate over all buckets and their 
records - for bucket_records in records.records_by_buckets().values() { + for (table_bucket, bucket_records) in records.records_by_buckets() { + let bucket_id = table_bucket.bucket_id(); for record in bucket_records { let row = record.row(); let fields = core_row_to_ffi_fields(row); ffi_records.push(ffi::FfiScanRecord { + bucket_id, offset: record.offset(), timestamp: record.timestamp(), row: ffi::FfiGenericRow { fields }, diff --git a/fluss-rust/crates/fluss/src/client/admin.rs b/fluss-rust/crates/fluss/src/client/admin.rs index e185af84ac..6646f97cf8 100644 --- a/fluss-rust/crates/fluss/src/client/admin.rs +++ b/fluss-rust/crates/fluss/src/client/admin.rs @@ -232,6 +232,13 @@ impl FlussAdmin { .check_and_update_table_metadata(from_ref(table_path)) .await?; + if buckets_id.is_empty() { + return Err(Error::UnexpectedError { + message: "Buckets are empty.".to_string(), + source: None, + }); + } + let cluster = self.metadata.get_cluster(); let table_id = cluster.get_table(table_path).table_id; diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index 1e70649e0b..a9384d905a 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -185,6 +185,28 @@ impl LogScanner { Ok(()) } + pub async fn subscribe_batch(&self, bucket_offsets: HashMap) -> Result<()> { + self.metadata + .check_and_update_table_metadata(from_ref(&self.table_path)) + .await?; + if bucket_offsets.is_empty() { + return Err(Error::UnexpectedError { + message: "Bucket offsets are empty.".to_string(), + source: None, + }); + } + + let mut scan_bucket_offsets = HashMap::new(); + for (bucket_id, offset) in bucket_offsets { + let table_bucket = TableBucket::new(self.table_id, bucket_id); + scan_bucket_offsets.insert(table_bucket, offset); + } + + self.log_scanner_status + .assign_scan_buckets(scan_bucket_offsets); + Ok(()) + } + async fn poll_for_fetches(&self) -> Result>> { 
self.log_fetcher.send_fetches_and_collect().await } From 39189175320e7453e8ad80537d61541741a82d54 Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Sun, 21 Dec 2025 15:22:05 +0800 Subject: [PATCH 041/287] chore: introduce prefetch to improve log poll performance (#103) --- fluss-rust/crates/fluss/Cargo.toml | 1 + .../crates/fluss/src/client/credentials.rs | 30 +- .../src/client/table/log_fetch_buffer.rs | 376 +++++++++++ .../crates/fluss/src/client/table/mod.rs | 1 + .../fluss/src/client/table/remote_log.rs | 195 ++++-- .../crates/fluss/src/client/table/scanner.rs | 586 ++++++++++++++---- fluss-rust/crates/fluss/src/record/arrow.rs | 113 ++-- .../crates/fluss/tests/integration/table.rs | 6 +- .../tests/integration/table_remote_scan.rs | 8 +- 9 files changed, 1077 insertions(+), 239 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index cdba9de5a4..27604eecd8 100644 --- a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -58,6 +58,7 @@ url = "2.5.7" uuid = { version = "1.10", features = ["v4"] } tempfile = "3.23.0" snafu = "0.8.3" +scopeguard = "1.2.0" [target.'cfg(target_arch = "wasm32")'.dependencies] jiff = { workspace = true, features = ["js"] } diff --git a/fluss-rust/crates/fluss/src/client/credentials.rs b/fluss-rust/crates/fluss/src/client/credentials.rs index 6b07d08eb1..8adfe48b99 100644 --- a/fluss-rust/crates/fluss/src/client/credentials.rs +++ b/fluss-rust/crates/fluss/src/client/credentials.rs @@ -90,20 +90,20 @@ fn convert_hadoop_key_to_opendal(hadoop_key: &str) -> Option<(String, bool)> { pub struct CredentialsCache { inner: RwLock>, + rpc_client: Arc, + metadata: Arc, } impl CredentialsCache { - pub fn new() -> Self { + pub fn new(rpc_client: Arc, metadata: Arc) -> Self { Self { inner: RwLock::new(None), + rpc_client, + metadata, } } - pub async fn get_or_refresh( - &self, - rpc_client: &Arc, - 
metadata: &Arc, - ) -> Result> { + pub async fn get_or_refresh(&self) -> Result> { { let guard = self.inner.read(); if let Some(cached) = guard.as_ref() { @@ -113,17 +113,13 @@ impl CredentialsCache { } } - self.refresh_from_server(rpc_client, metadata).await + self.refresh_from_server().await } - async fn refresh_from_server( - &self, - rpc_client: &Arc, - metadata: &Arc, - ) -> Result> { - let cluster = metadata.get_cluster(); + async fn refresh_from_server(&self) -> Result> { + let cluster = self.metadata.get_cluster(); let server_node = cluster.get_one_available_server(); - let conn = rpc_client.get_connection(server_node).await?; + let conn = self.rpc_client.get_connection(server_node).await?; let request = GetSecurityTokenRequest::new(); let response = conn.request(request).await?; @@ -158,9 +154,3 @@ impl CredentialsCache { Ok(props) } } - -impl Default for CredentialsCache { - fn default() -> Self { - Self::new() - } -} diff --git a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs new file mode 100644 index 0000000000..cee104e020 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs @@ -0,0 +1,376 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Result; +use crate::metadata::TableBucket; +use crate::record::{ + LogRecordBatch, LogRecordIterator, LogRecordsBatches, ReadContext, ScanRecord, +}; +use parking_lot::Mutex; +use std::collections::{HashMap, VecDeque}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; +use tokio::sync::Notify; + +/// Represents a completed fetch that can be consumed +pub trait CompletedFetch: Send + Sync { + fn table_bucket(&self) -> &TableBucket; + fn fetch_records(&mut self, max_records: usize) -> Result>; + fn is_consumed(&self) -> bool; + fn drain(&mut self); + fn size_in_bytes(&self) -> usize; + fn high_watermark(&self) -> i64; + fn is_initialized(&self) -> bool; + fn set_initialized(&mut self); + fn next_fetch_offset(&self) -> i64; +} + +/// Represents a pending fetch that is waiting to be completed +pub trait PendingFetch: Send + Sync { + fn table_bucket(&self) -> &TableBucket; + fn is_completed(&self) -> bool; + fn to_completed_fetch(self: Box) -> Result>; +} + +/// Thread-safe buffer for completed fetches +pub struct LogFetchBuffer { + completed_fetches: Mutex>>, + pending_fetches: Mutex>>>, + next_in_line_fetch: Mutex>>, + not_empty_notify: Notify, + woken_up: Arc, +} + +impl LogFetchBuffer { + pub fn new() -> Self { + Self { + completed_fetches: Mutex::new(VecDeque::new()), + pending_fetches: Mutex::new(HashMap::new()), + next_in_line_fetch: Mutex::new(None), + not_empty_notify: Notify::new(), + woken_up: Arc::new(AtomicBool::new(false)), + } + } + + /// Check if the buffer is empty + pub fn is_empty(&self) -> bool { + self.completed_fetches.lock().is_empty() + } + + /// Wait for the buffer to become non-empty, with timeout + /// Returns true if data became available, false if timeout + pub async fn await_not_empty(&self, timeout: Duration) -> bool { + let deadline = std::time::Instant::now() + 
timeout; + + loop { + // Check if buffer is not empty + if !self.is_empty() { + return true; + } + + // Check if woken up + if self.woken_up.swap(false, Ordering::Acquire) { + return true; + } + + // Check if timeout + let now = std::time::Instant::now(); + if now >= deadline { + return false; + } + + // Wait for notification with remaining time + let remaining = deadline - now; + let notified = self.not_empty_notify.notified(); + tokio::select! { + _ = tokio::time::sleep(remaining) => { + return false; // Timeout + } + _ = notified => { + // Got notification, check again + continue; + } + } + } + } + + #[allow(dead_code)] + /// Wake up any waiting threads + pub fn wakeup(&self) { + self.woken_up.store(true, Ordering::Release); + self.not_empty_notify.notify_waiters(); + } + + /// Add a pending fetch to the buffer + pub fn pend(&self, pending_fetch: Box) { + let table_bucket = pending_fetch.table_bucket().clone(); + self.pending_fetches + .lock() + .entry(table_bucket) + .or_default() + .push_back(pending_fetch); + } + + /// Try to complete pending fetches in order, converting them to completed fetches + pub fn try_complete(&self, table_bucket: &TableBucket) { + // Collect completed fetches while holding the pending_fetches lock, + // then push them to completed_fetches after releasing it to avoid + // holding both locks simultaneously. + let mut completed_to_push: Vec> = Vec::new(); + let mut has_completed = false; + { + let mut pending_map = self.pending_fetches.lock(); + if let Some(pendings) = pending_map.get_mut(table_bucket) { + while let Some(front) = pendings.front() { + if front.is_completed() { + let pending = pendings.pop_front().unwrap(); + match pending.to_completed_fetch() { + Ok(completed) => { + completed_to_push.push(completed); + has_completed = true; + } + Err(e) => { + // todo: handle exception? 
+ log::error!("Error when completing: {e}"); + } + } + } else { + break; + } + } + if has_completed && pendings.is_empty() { + pending_map.remove(table_bucket); + } + } + } + + if !completed_to_push.is_empty() { + let mut completed_queue = self.completed_fetches.lock(); + for completed in completed_to_push { + completed_queue.push_back(completed); + } + } + + if has_completed { + // Signal that buffer is not empty + self.not_empty_notify.notify_waiters(); + } + } + + /// Add a completed fetch to the buffer + pub fn add(&self, completed_fetch: Box) { + let table_bucket = completed_fetch.table_bucket(); + let mut pending_map = self.pending_fetches.lock(); + + if let Some(pendings) = pending_map.get_mut(table_bucket) + && !pendings.is_empty() + { + pendings.push_back(Box::new(CompletedPendingFetch::new(completed_fetch))); + return; + } + // If there's no pending fetch for this table_bucket, + // directly add to completed_fetches + self.completed_fetches.lock().push_back(completed_fetch); + self.not_empty_notify.notify_waiters(); + } + + /// Poll the next completed fetch + pub fn poll(&self) -> Option> { + self.completed_fetches.lock().pop_front() + } + + /// Get the next in line fetch + pub fn next_in_line_fetch(&self) -> Option> { + self.next_in_line_fetch.lock().take() + } + + /// Set the next in line fetch + pub fn set_next_in_line_fetch(&self, fetch: Option>) { + *self.next_in_line_fetch.lock() = fetch; + } + + /// Get the set of buckets that have buffered data + pub fn buffered_buckets(&self) -> Vec { + let mut buckets = Vec::new(); + + let next_in_line_fetch = self.next_in_line_fetch.lock(); + if let Some(complete_fetch) = next_in_line_fetch.as_ref() { + if !complete_fetch.is_consumed() { + buckets.push(complete_fetch.table_bucket().clone()); + } + } + + let completed = self.completed_fetches.lock(); + for fetch in completed.iter() { + buckets.push(fetch.table_bucket().clone()); + } + let pending = self.pending_fetches.lock(); + 
buckets.extend(pending.keys().cloned()); + buckets + } +} + +impl Default for LogFetchBuffer { + fn default() -> Self { + Self::new() + } +} + +/// A wrapper that makes a completed fetch look like a pending fetch +struct CompletedPendingFetch { + completed_fetch: Box, +} + +impl CompletedPendingFetch { + fn new(completed_fetch: Box) -> Self { + Self { completed_fetch } + } +} + +impl PendingFetch for CompletedPendingFetch { + fn table_bucket(&self) -> &TableBucket { + self.completed_fetch.table_bucket() + } + + fn is_completed(&self) -> bool { + true + } + + fn to_completed_fetch(self: Box) -> Result> { + Ok(self.completed_fetch) + } +} + +/// Default implementation of CompletedFetch for in-memory log records +pub struct DefaultCompletedFetch { + table_bucket: TableBucket, + log_record_batch: LogRecordsBatches, + read_context: ReadContext, + next_fetch_offset: i64, + high_watermark: i64, + size_in_bytes: usize, + consumed: bool, + initialized: bool, + records_read: usize, + current_record_iterator: Option, + current_record_batch: Option, +} + +impl DefaultCompletedFetch { + pub fn new( + table_bucket: TableBucket, + log_record_batch: LogRecordsBatches, + size_in_bytes: usize, + read_context: ReadContext, + fetch_offset: i64, + high_watermark: i64, + ) -> Result { + Ok(Self { + table_bucket, + log_record_batch, + read_context, + next_fetch_offset: fetch_offset, + high_watermark, + size_in_bytes, + consumed: false, + initialized: false, + records_read: 0, + current_record_iterator: None, + current_record_batch: None, + }) + } + + /// Get the next fetched record, handling batch iteration and record skipping + fn next_fetched_record(&mut self) -> Result> { + loop { + if let Some(record) = self + .current_record_iterator + .as_mut() + .and_then(Iterator::next) + { + if record.offset() >= self.next_fetch_offset { + return Ok(Some(record)); + } + } else if let Some(batch) = self.log_record_batch.next() { + self.current_record_iterator = 
Some(batch.records(&self.read_context)?); + self.current_record_batch = Some(batch); + } else { + if let Some(batch) = self.current_record_batch.take() { + self.next_fetch_offset = batch.next_log_offset(); + } + self.drain(); + return Ok(None); + } + } + } +} + +impl CompletedFetch for DefaultCompletedFetch { + fn table_bucket(&self) -> &TableBucket { + &self.table_bucket + } + + fn fetch_records(&mut self, max_records: usize) -> Result> { + // todo: handle corrupt_last_record + if self.consumed { + return Ok(Vec::new()); + } + + let mut scan_records = Vec::new(); + + for _ in 0..max_records { + if let Some(record) = self.next_fetched_record()? { + self.next_fetch_offset = record.offset() + 1; + self.records_read += 1; + scan_records.push(record); + } else { + break; + } + } + + Ok(scan_records) + } + + fn is_consumed(&self) -> bool { + self.consumed + } + + fn drain(&mut self) { + self.consumed = true; + } + + fn size_in_bytes(&self) -> usize { + self.size_in_bytes + } + + fn high_watermark(&self) -> i64 { + self.high_watermark + } + + fn is_initialized(&self) -> bool { + self.initialized + } + + fn set_initialized(&mut self) { + self.initialized = true; + } + + fn next_fetch_offset(&self) -> i64 { + self.next_fetch_offset + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/mod.rs b/fluss-rust/crates/fluss/src/client/table/mod.rs index 99722477c5..e2cf9e6d5e 100644 --- a/fluss-rust/crates/fluss/src/client/table/mod.rs +++ b/fluss-rust/crates/fluss/src/client/table/mod.rs @@ -26,6 +26,7 @@ pub const EARLIEST_OFFSET: i64 = -2; mod append; +mod log_fetch_buffer; mod remote_log; mod scanner; mod writer; diff --git a/fluss-rust/crates/fluss/src/client/table/remote_log.rs b/fluss-rust/crates/fluss/src/client/table/remote_log.rs index 10273dde2e..d9abd19c29 100644 --- a/fluss-rust/crates/fluss/src/client/table/remote_log.rs +++ b/fluss-rust/crates/fluss/src/client/table/remote_log.rs @@ -14,16 +14,18 @@ // See the License for the specific language governing 
permissions and // limitations under the License. +use crate::client::table::log_fetch_buffer::{CompletedFetch, DefaultCompletedFetch, PendingFetch}; use crate::error::{Error, Result}; use crate::io::{FileIO, Storage}; use crate::metadata::TableBucket; use crate::proto::{PbRemoteLogFetchInfo, PbRemoteLogSegment}; -use crate::record::{LogRecordsBatchs, ReadContext, ScanRecord}; +use crate::record::{LogRecordsBatches, ReadContext}; use crate::util::delete_file; -use parking_lot::RwLock; +use parking_lot::{Mutex, RwLock}; use std::collections::HashMap; use std::io; use std::path::{Path, PathBuf}; +use std::sync::Arc; use tempfile::TempDir; use tokio::io::AsyncWriteExt; use tokio::sync::oneshot; @@ -70,45 +72,121 @@ pub struct RemoteLogFetchInfo { } impl RemoteLogFetchInfo { - pub fn from_proto(info: &PbRemoteLogFetchInfo, table_bucket: TableBucket) -> Result { + pub fn from_proto(info: &PbRemoteLogFetchInfo, table_bucket: TableBucket) -> Self { let segments = info .remote_log_segments .iter() .map(|s| RemoteLogSegment::from_proto(s, table_bucket.clone())) .collect(); - Ok(Self { + Self { remote_log_tablet_dir: info.remote_log_tablet_dir.clone(), partition_name: info.partition_name.clone(), remote_log_segments: segments, first_start_pos: info.first_start_pos.unwrap_or(0), - }) + } } } +type CompletionCallback = Box; + /// Future for a remote log download request pub struct RemoteLogDownloadFuture { - receiver: Option>>, + result: Arc>>>>, + completion_callbacks: Arc>>, + // todo: add recycleCallback } impl RemoteLogDownloadFuture { - pub fn new(receiver: oneshot::Receiver>) -> Self { + pub fn new(receiver: oneshot::Receiver>>) -> Self { + let result = Arc::new(Mutex::new(None)); + let result_clone = Arc::clone(&result); + let completion_callbacks: Arc>> = + Arc::new(Mutex::new(Vec::new())); + let callbacks_clone = Arc::clone(&completion_callbacks); + + // Spawn a task to wait for the download and update result, then call callbacks + tokio::spawn(async move { + let 
download_result = match receiver.await { + Ok(Ok(path)) => Ok(path), + Ok(Err(e)) => Err(e), + Err(e) => Err(Error::UnexpectedError { + message: format!("Download & Read future cancelled: {e:?}"), + source: None, + }), + }; + + *result_clone.lock() = Some(download_result); + + // Call all registered callbacks + // We need to take the callbacks to avoid holding the lock while calling them + // This also ensures that any callbacks registered after this point will be called immediately + let callbacks: Vec = { + let mut callbacks_guard = callbacks_clone.lock(); + std::mem::take(&mut *callbacks_guard) + }; + for callback in callbacks { + callback(); + } + + // After calling callbacks, any new callbacks registered will see is_done() == true + // and will be called immediately in on_complete() + }); + Self { - receiver: Some(receiver), + result, + completion_callbacks, } } - /// Get the downloaded file path - pub async fn get_file_path(&mut self) -> Result { - let receiver = self.receiver.take().ok_or_else(|| Error::UnexpectedError { - message: "Downloaded file already consumed".to_string(), - source: None, - })?; - - receiver.await.map_err(|e| Error::UnexpectedError { - message: format!("Download future cancelled: {e:?}"), - source: None, - })? + /// Register a callback to be called when download completes (similar to Java's onComplete) + pub fn on_complete(&self, callback: F) + where + F: Fn() + Send + Sync + 'static, + { + // Acquire callbacks lock first to ensure atomicity of the check-and-register operation + let mut callbacks_guard = self.completion_callbacks.lock(); + + // Check completion status while holding the callbacks lock. + // This ensures that: + // 1. If the task completes between checking is_done() and registering the callback, + // we'll see the completion state correctly + // 2. 
The background task cannot clear the callbacks list while we're checking/registering + let is_done = self.result.lock().is_some(); + + if is_done { + // If already completed, call immediately (drop lock first to avoid deadlock) + drop(callbacks_guard); + callback(); + } else { + // Register the callback while holding the callbacks lock. + // This ensures that even if the background task completes right after we check + // is_done(), it will wait for us to release the lock before taking callbacks. + // When it does take callbacks, it will see our callback in the list and execute it. + callbacks_guard.push(Box::new(callback)); + // Lock is automatically released here + } + } + + pub fn is_done(&self) -> bool { + self.result.lock().is_some() + } + + /// Get the downloaded file path (synchronous, only works after is_done() returns true) + pub fn get_remote_log_bytes(&self) -> Result> { + // todo: handle download fail + let guard = self.result.lock(); + match guard.as_ref() { + Some(Ok(path)) => Ok(path.clone()), + Some(Err(e)) => Err(Error::IoUnexpectedError { + message: format!("Fail to get remote log bytes: {e}"), + source: io::Error::other(format!("{e:?}")), + }), + None => Err(Error::IoUnexpectedError { + message: "Get remote log bytes not completed yet".to_string(), + source: io::Error::other("Get remote log bytes not completed yet"), + }), + } } } @@ -135,25 +213,38 @@ impl RemoteLogDownloader { &self, remote_log_tablet_dir: &str, segment: &RemoteLogSegment, - ) -> Result { + ) -> RemoteLogDownloadFuture { let (sender, receiver) = oneshot::channel(); let local_file_name = segment.local_file_name(); let local_file_path = self.local_log_dir.path().join(&local_file_name); let remote_path = self.build_remote_path(remote_log_tablet_dir, segment); let remote_log_tablet_dir = remote_log_tablet_dir.to_string(); let remote_fs_props = self.remote_fs_props.read().clone(); - // Spawn async download task + // Spawn async download & read task tokio::spawn(async move { - let 
result = Self::download_file( - &remote_log_tablet_dir, - &remote_path, - &local_file_path, - &remote_fs_props, - ) + let result = async { + let file_path = Self::download_file( + &remote_log_tablet_dir, + &remote_path, + &local_file_path, + &remote_fs_props, + ) + .await?; + let bytes = tokio::fs::read(&file_path).await?; + + // Delete the downloaded local file to free disk (async, but we'll do it in background) + let file_path_clone = file_path.clone(); + tokio::spawn(async move { + let _ = delete_file(file_path_clone).await; + }); + + Ok(bytes) + } .await; + let _ = sender.send(result); }); - Ok(RemoteLogDownloadFuture::new(receiver)) + RemoteLogDownloadFuture::new(receiver) } /// Build the remote path for a log segment @@ -261,9 +352,7 @@ pub struct RemotePendingFetch { segment: RemoteLogSegment, download_future: RemoteLogDownloadFuture, pos_in_log_segment: i32, - #[allow(dead_code)] fetch_offset: i64, - #[allow(dead_code)] high_watermark: i64, read_context: ReadContext, } @@ -286,32 +375,42 @@ impl RemotePendingFetch { read_context, } } +} + +impl PendingFetch for RemotePendingFetch { + fn table_bucket(&self) -> &TableBucket { + &self.segment.table_bucket + } - /// Convert to completed fetch by reading the downloaded file - pub async fn convert_to_completed_fetch( - mut self, - ) -> Result>> { - let file_path = self.download_future.get_file_path().await?; - let file_data = tokio::fs::read(&file_path).await?; + fn is_completed(&self) -> bool { + self.download_future.is_done() + } + + fn to_completed_fetch(self: Box) -> Result> { + // Get the file path (this should only be called when is_completed() returns true) + let mut data = self.download_future.get_remote_log_bytes()?; // Slice the data if needed let data = if self.pos_in_log_segment > 0 { - &file_data[self.pos_in_log_segment as usize..] 
+ data.split_off(self.pos_in_log_segment as usize) } else { - &file_data + data }; - // delete the downloaded local file to free disk - delete_file(file_path).await; + let size_in_bytes = data.len(); - // Parse log records (remote log contains full data, need client-side projection) - let mut fetch_records = vec![]; - for log_record in &mut LogRecordsBatchs::new(data) { - fetch_records.extend(log_record.records_for_remote_log(&self.read_context)?); - } + let log_record_batch = LogRecordsBatches::new(data); + + // Create DefaultCompletedFetch from the data + let completed_fetch = DefaultCompletedFetch::new( + self.segment.table_bucket, + log_record_batch, + size_in_bytes, + self.read_context, + self.fetch_offset, + self.high_watermark, + )?; - let mut result = HashMap::new(); - result.insert(self.segment.table_bucket.clone(), fetch_records); - Ok(result) + Ok(Box::new(completed_fetch)) } } diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index a9384d905a..2246e2cbae 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -18,24 +18,27 @@ use crate::client::connection::FlussConnection; use crate::client::credentials::CredentialsCache; use crate::client::metadata::Metadata; +use crate::client::table::log_fetch_buffer::{ + CompletedFetch, DefaultCompletedFetch, LogFetchBuffer, +}; +use crate::client::table::remote_log::{ + RemoteLogDownloader, RemoteLogFetchInfo, RemotePendingFetch, +}; use crate::error::{Error, Result}; use crate::metadata::{TableBucket, TableInfo, TablePath}; use crate::proto::{FetchLogRequest, PbFetchLogReqForBucket, PbFetchLogReqForTable}; -use crate::record::{LogRecordsBatchs, ReadContext, ScanRecord, ScanRecords, to_arrow_schema}; -use crate::rpc::RpcClient; +use crate::record::{LogRecordsBatches, ReadContext, ScanRecord, ScanRecords, to_arrow_schema}; +use crate::rpc::{RpcClient, message}; use 
crate::util::FairBucketStatusMap; use arrow_schema::SchemaRef; -use parking_lot::RwLock; -use std::collections::HashMap; +use log::{debug, error, warn}; +use parking_lot::{Mutex, RwLock}; +use std::collections::{HashMap, HashSet}; use std::slice::from_ref; use std::sync::Arc; use std::time::Duration; use tempfile::TempDir; -use crate::client::table::remote_log::{ - RemoteLogDownloader, RemoteLogFetchInfo, RemotePendingFetch, -}; - const LOG_FETCH_MAX_BYTES: i32 = 16 * 1024 * 1024; #[allow(dead_code)] const LOG_FETCH_MAX_BYTES_FOR_BUCKET: i32 = 1024; @@ -171,8 +174,43 @@ impl LogScanner { }) } - pub async fn poll(&self, _timeout: Duration) -> Result { - Ok(ScanRecords::new(self.poll_for_fetches().await?)) + pub async fn poll(&self, timeout: Duration) -> Result { + let start = std::time::Instant::now(); + let deadline = start + timeout; + + loop { + // Try to collect fetches + let fetch_result = self.poll_for_fetches().await?; + + if !fetch_result.is_empty() { + // We have data, send next round of fetches and return + // This enables pipelining while user processes the data + self.log_fetcher.send_fetches().await?; + return Ok(ScanRecords::new(fetch_result)); + } + + // No data available, check if we should wait + let now = std::time::Instant::now(); + if now >= deadline { + // Timeout reached, return empty result + return Ok(ScanRecords::new(HashMap::new())); + } + + // Wait for buffer to become non-empty with remaining time + let remaining = deadline - now; + let has_data = self + .log_fetcher + .log_fetch_buffer + .await_not_empty(remaining) + .await; + + if !has_data { + // Timeout while waiting + return Ok(ScanRecords::new(HashMap::new())); + } + + // Buffer became non-empty, try again + } } pub async fn subscribe(&self, bucket: i32, offset: i64) -> Result<()> { @@ -208,20 +246,31 @@ impl LogScanner { } async fn poll_for_fetches(&self) -> Result>> { - self.log_fetcher.send_fetches_and_collect().await + let result = self.log_fetcher.collect_fetches()?; + if 
!result.is_empty() { + return Ok(result); + } + + // send any new fetches (won't resend pending fetches). + self.log_fetcher.send_fetches().await?; + + // Collect completed fetches from buffer + self.log_fetcher.collect_fetches() } } -#[allow(dead_code)] struct LogFetcher { - table_path: TablePath, conns: Arc, - table_info: TableInfo, metadata: Arc, log_scanner_status: Arc, read_context: ReadContext, - remote_log_downloader: RemoteLogDownloader, - credentials_cache: CredentialsCache, + remote_read_context: ReadContext, + remote_log_downloader: Arc, + // todo: consider schedule a background thread to update + // token instead of update in fetch phase + credentials_cache: Arc, + log_fetch_buffer: Arc, + nodes_with_pending_fetch_requests: Arc>>, } impl LogFetcher { @@ -233,126 +282,306 @@ impl LogFetcher { projected_fields: Option>, ) -> Result { let full_arrow_schema = to_arrow_schema(table_info.get_row_type()); - let read_context = Self::create_read_context(full_arrow_schema, projected_fields.clone()); + let read_context = + Self::create_read_context(full_arrow_schema.clone(), projected_fields.clone(), false); + let remote_read_context = + Self::create_read_context(full_arrow_schema, projected_fields.clone(), true); let tmp_dir = TempDir::with_prefix("fluss-remote-logs")?; Ok(LogFetcher { - table_path: table_info.table_path.clone(), - conns, - table_info, - metadata, + conns: conns.clone(), + metadata: metadata.clone(), log_scanner_status, read_context, - remote_log_downloader: RemoteLogDownloader::new(tmp_dir)?, - credentials_cache: CredentialsCache::new(), + remote_read_context, + remote_log_downloader: Arc::new(RemoteLogDownloader::new(tmp_dir)?), + credentials_cache: Arc::new(CredentialsCache::new(conns.clone(), metadata.clone())), + log_fetch_buffer: Arc::new(LogFetchBuffer::new()), + nodes_with_pending_fetch_requests: Arc::new(Mutex::new(HashSet::new())), }) } fn create_read_context( full_arrow_schema: SchemaRef, projected_fields: Option>, + is_from_remote: 
bool, ) -> ReadContext { match projected_fields { - None => ReadContext::new(full_arrow_schema), - Some(fields) => ReadContext::with_projection_pushdown(full_arrow_schema, fields), + None => ReadContext::new(full_arrow_schema, is_from_remote), + Some(fields) => { + ReadContext::with_projection_pushdown(full_arrow_schema, fields, is_from_remote) + } } } - async fn send_fetches_and_collect(&self) -> Result>> { + /// Send fetch requests asynchronously without waiting for responses + async fn send_fetches(&self) -> Result<()> { + // todo: check update metadata like fluss-java in case leader changes let fetch_request = self.prepare_fetch_log_requests().await; - let mut result: HashMap> = HashMap::new(); + for (leader, fetch_request) in fetch_request { - let cluster = self.metadata.get_cluster(); - let server_node = cluster - .get_tablet_server(leader) - .expect("todo: handle leader not exist."); - let con = self.conns.get_connection(server_node).await?; - - let fetch_response = con - .request(crate::rpc::message::FetchLogRequest::new(fetch_request)) - .await?; - - for pb_fetch_log_resp in fetch_response.tables_resp { - let table_id = pb_fetch_log_resp.table_id; - let fetch_log_for_buckets = pb_fetch_log_resp.buckets_resp; - - for fetch_log_for_bucket in fetch_log_for_buckets { - let bucket: i32 = fetch_log_for_bucket.bucket_id; - let table_bucket = TableBucket::new(table_id, bucket); - - // Check if this is a remote log fetch - if let Some(ref remote_log_fetch_info) = - fetch_log_for_bucket.remote_log_fetch_info - { - let remote_fs_props = self - .credentials_cache - .get_or_refresh(&self.conns, &self.metadata) - .await?; - self.remote_log_downloader - .set_remote_fs_props(remote_fs_props); - let remote_fetch_info = RemoteLogFetchInfo::from_proto( - remote_log_fetch_info, - table_bucket.clone(), - )?; - - if let Some(fetch_offset) = - self.log_scanner_status.get_bucket_offset(&table_bucket) - { - let high_watermark = fetch_log_for_bucket.high_watermark.unwrap_or(-1); - 
// Download and process remote log segments - let mut pos_in_log_segment = remote_fetch_info.first_start_pos; - let mut current_fetch_offset = fetch_offset; - // todo: make segment download in parallel - for (i, segment) in - remote_fetch_info.remote_log_segments.iter().enumerate() - { - if i > 0 { - pos_in_log_segment = 0; - current_fetch_offset = segment.start_offset; - } + debug!("Adding pending request for node id {leader}"); + // Check if we already have a pending request for this node + { + self.nodes_with_pending_fetch_requests.lock().insert(leader); + } + + let cluster = self.metadata.get_cluster().clone(); + + let conns = Arc::clone(&self.conns); + let log_fetch_buffer = self.log_fetch_buffer.clone(); + let log_scanner_status = self.log_scanner_status.clone(); + let read_context = self.read_context.clone(); + let remote_read_context = self.remote_read_context.clone(); + let remote_log_downloader = Arc::clone(&self.remote_log_downloader); + let creds_cache = self.credentials_cache.clone(); + let nodes_with_pending = self.nodes_with_pending_fetch_requests.clone(); + + // Spawn async task to handle the fetch request + // Note: These tasks are not explicitly tracked or cancelled when LogFetcher is dropped. + // This is acceptable because: + // 1. Tasks will naturally complete (network requests will return or timeout) + // 2. Tasks use Arc references, so resources are properly shared + // 3. When the program exits, tokio runtime will clean up all tasks + // 4. 
Tasks are short-lived (network I/O operations) + tokio::spawn(async move { + // make sure it will always remove leader from pending nodes + let _guard = scopeguard::guard((), |_| { + nodes_with_pending.lock().remove(&leader); + }); + + let server_node = cluster + .get_tablet_server(leader) + .expect("todo: handle leader not exist."); + + let con = match conns.get_connection(server_node).await { + Ok(con) => con, + Err(e) => { + // todo: handle failed to get connection + warn!("Failed to get connection to destination node: {e:?}"); + return; + } + }; + + let fetch_response = match con + .request(message::FetchLogRequest::new(fetch_request)) + .await + { + Ok(resp) => resp, + Err(e) => { + // todo: handle fetch log from destination node + warn!("Failed to fetch log from destination node {server_node:?}: {e:?}"); + return; + } + }; + + if let Err(e) = Self::handle_fetch_response( + fetch_response, + &log_fetch_buffer, + &log_scanner_status, + &read_context, + &remote_read_context, + &remote_log_downloader, + &creds_cache, + ) + .await + { + // todo: handle fail to handle fetch response + error!("Fail to handle fetch response: {e:?}"); + } + }); + } + + Ok(()) + } - let download_future = - self.remote_log_downloader.request_remote_log( - &remote_fetch_info.remote_log_tablet_dir, - segment, - )?; - let pending_fetch = RemotePendingFetch::new( - segment.clone(), - download_future, - pos_in_log_segment, - current_fetch_offset, - high_watermark, - self.read_context.clone(), - ); - let remote_records = - pending_fetch.convert_to_completed_fetch().await?; - // Update offset and merge results - for (tb, records) in remote_records { - if let Some(last_record) = records.last() { - self.log_scanner_status - .update_offset(&tb, last_record.offset() + 1); - } - result.entry(tb).or_default().extend(records); + /// Handle fetch response and add completed fetches to buffer + async fn handle_fetch_response( + fetch_response: crate::proto::FetchLogResponse, + log_fetch_buffer: &Arc, + 
log_scanner_status: &Arc, + read_context: &ReadContext, + remote_read_context: &ReadContext, + remote_log_downloader: &Arc, + credentials_cache: &Arc, + ) -> Result<()> { + for pb_fetch_log_resp in fetch_response.tables_resp { + let table_id = pb_fetch_log_resp.table_id; + let fetch_log_for_buckets = pb_fetch_log_resp.buckets_resp; + + for fetch_log_for_bucket in fetch_log_for_buckets { + let bucket: i32 = fetch_log_for_bucket.bucket_id; + let table_bucket = TableBucket::new(table_id, bucket); + + // todo: check fetch result code for per-bucket + let Some(fetch_offset) = log_scanner_status.get_bucket_offset(&table_bucket) else { + debug!( + "Ignoring fetch log response for bucket {table_bucket} because the bucket has been unsubscribed." + ); + continue; + }; + + // Check if this is a remote log fetch + if let Some(ref remote_log_fetch_info) = fetch_log_for_bucket.remote_log_fetch_info + { + // set remote fs props + let remote_fs_props = credentials_cache.get_or_refresh().await?; + remote_log_downloader.set_remote_fs_props(remote_fs_props); + + let remote_fetch_info = + RemoteLogFetchInfo::from_proto(remote_log_fetch_info, table_bucket.clone()); + + let high_watermark = fetch_log_for_bucket.high_watermark.unwrap_or(-1); + Self::pending_remote_fetches( + remote_log_downloader.clone(), + log_fetch_buffer.clone(), + remote_read_context.clone(), + &table_bucket, + remote_fetch_info, + fetch_offset, + high_watermark, + ); + } else if fetch_log_for_bucket.records.is_some() { + // Handle regular in-memory records - create completed fetch directly + let high_watermark = fetch_log_for_bucket.high_watermark.unwrap_or(-1); + let records = fetch_log_for_bucket.records.unwrap_or(vec![]); + let size_in_bytes = records.len(); + let log_record_batch = LogRecordsBatches::new(records); + + match DefaultCompletedFetch::new( + table_bucket.clone(), + log_record_batch, + size_in_bytes, + read_context.clone(), + fetch_offset, + high_watermark, + ) { + Ok(completed_fetch) => { + 
log_fetch_buffer.add(Box::new(completed_fetch)); + } + Err(e) => { + // todo: handle error + log::warn!("Failed to create completed fetch: {e:?}"); + } + } + } + } + } + Ok(()) + } + + fn pending_remote_fetches( + remote_log_downloader: Arc, + log_fetch_buffer: Arc, + read_context: ReadContext, + table_bucket: &TableBucket, + remote_fetch_info: RemoteLogFetchInfo, + fetch_offset: i64, + high_watermark: i64, + ) { + // Download and process remote log segments + let mut pos_in_log_segment = remote_fetch_info.first_start_pos; + let mut current_fetch_offset = fetch_offset; + for (i, segment) in remote_fetch_info.remote_log_segments.iter().enumerate() { + if i > 0 { + pos_in_log_segment = 0; + current_fetch_offset = segment.start_offset; + } + + // todo: + // 1: control the max threads to download remote segment + // 2: introduce priority queue to priority highest for earliest segment + let download_future = remote_log_downloader + .request_remote_log(&remote_fetch_info.remote_log_tablet_dir, segment); + + // Register callback to be called when download completes + // (similar to Java's downloadFuture.onComplete) + // This must be done before creating RemotePendingFetch to avoid move issues + let table_bucket = table_bucket.clone(); + let log_fetch_buffer_clone = log_fetch_buffer.clone(); + download_future.on_complete(move || { + log_fetch_buffer_clone.try_complete(&table_bucket); + }); + + let pending_fetch = RemotePendingFetch::new( + segment.clone(), + download_future, + pos_in_log_segment, + current_fetch_offset, + high_watermark, + read_context.clone(), + ); + // Add to pending fetches in buffer (similar to Java's logFetchBuffer.pend) + log_fetch_buffer.pend(Box::new(pending_fetch)); + } + } + + /// Collect completed fetches from buffer + /// Reference: LogFetchCollector.collectFetch in Java + fn collect_fetches(&self) -> Result>> { + const MAX_POLL_RECORDS: usize = 500; // Default max poll records + let mut result: HashMap> = HashMap::new(); + let mut 
records_remaining = MAX_POLL_RECORDS; + + while records_remaining > 0 { + // Get the next in line fetch, or get a new one from buffer + let next_in_line = self.log_fetch_buffer.next_in_line_fetch(); + + if next_in_line.is_none() || next_in_line.as_ref().unwrap().is_consumed() { + // Get a new fetch from buffer + if let Some(completed_fetch) = self.log_fetch_buffer.poll() { + // Initialize the fetch if not already initialized + if !completed_fetch.is_initialized() { + let size_in_bytes = completed_fetch.size_in_bytes(); + match self.initialize_fetch(completed_fetch) { + Ok(initialized) => { + self.log_fetch_buffer.set_next_in_line_fetch(initialized); + continue; + } + Err(e) => { + // Remove a completedFetch upon a parse with exception if + // (1) it contains no records, and + // (2) there are no fetched records with actual content preceding this + // exception. + if result.is_empty() && size_in_bytes == 0 { + // todo: do we need to consider it like java ? + // self.log_fetch_buffer.poll(); } + return Err(e); } - } else { - // if the offset is null, it means the bucket has been unsubscribed, - // skip processing and continue to the next bucket. 
- continue; } - } else if fetch_log_for_bucket.records.is_some() { - // Handle regular in-memory records - let mut fetch_records = vec![]; - let data = fetch_log_for_bucket.records.unwrap(); - for log_record in &mut LogRecordsBatchs::new(&data) { - let last_offset = log_record.last_log_offset(); - fetch_records.extend(log_record.records(&self.read_context)?); - self.log_scanner_status - .update_offset(&table_bucket, last_offset + 1); - } - result.insert(table_bucket, fetch_records); + } else { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(completed_fetch)); + } + // Note: poll() already removed the fetch from buffer, so no need to call poll() + } else { + // No more fetches available + break; + } + } else { + // Fetch records from next_in_line + if let Some(mut next_fetch) = next_in_line { + let records = + self.fetch_records_from_fetch(&mut next_fetch, records_remaining)?; + + if !records.is_empty() { + let table_bucket = next_fetch.table_bucket().clone(); + // Merge with existing records for this bucket + let existing = result.entry(table_bucket).or_default(); + let records_count = records.len(); + existing.extend(records); + + records_remaining = records_remaining.saturating_sub(records_count); } + + // If the fetch is not fully consumed, put it back for the next round + if !next_fetch.is_consumed() { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(next_fetch)); + } + // If consumed, next_fetch will be dropped here (which is correct) } } } @@ -360,6 +589,83 @@ impl LogFetcher { Ok(result) } + /// Initialize a completed fetch, checking offset match and updating high watermark + fn initialize_fetch( + &self, + mut completed_fetch: Box, + ) -> Result>> { + // todo: handle error in initialize fetch + let table_bucket = completed_fetch.table_bucket(); + let fetch_offset = completed_fetch.next_fetch_offset(); + + // Check if bucket is still subscribed + let Some(current_offset) = self.log_scanner_status.get_bucket_offset(table_bucket) else { + warn!( + 
"Discarding stale fetch response for bucket {table_bucket:?} since the bucket has been unsubscribed" + ); + return Ok(None); + }; + + // Check if offset matches + if fetch_offset != current_offset { + warn!( + "Discarding stale fetch response for bucket {table_bucket:?} since its offset {fetch_offset} does not match the expected offset {current_offset}" + ); + return Ok(None); + } + + // Update high watermark + let high_watermark = completed_fetch.high_watermark(); + if high_watermark >= 0 { + self.log_scanner_status + .update_high_watermark(table_bucket, high_watermark); + } + + completed_fetch.set_initialized(); + Ok(Some(completed_fetch)) + } + + /// Fetch records from a completed fetch, checking offset match + fn fetch_records_from_fetch( + &self, + next_in_line_fetch: &mut Box, + max_records: usize, + ) -> Result> { + let table_bucket = next_in_line_fetch.table_bucket().clone(); + let current_offset = self.log_scanner_status.get_bucket_offset(&table_bucket); + + if current_offset.is_none() { + warn!( + "Ignoring fetched records for {table_bucket:?} since the bucket has been unsubscribed" + ); + next_in_line_fetch.drain(); + return Ok(Vec::new()); + } + + let current_offset = current_offset.unwrap(); + let fetch_offset = next_in_line_fetch.next_fetch_offset(); + + // Check if this fetch is next in line + if fetch_offset == current_offset { + let records = next_in_line_fetch.fetch_records(max_records)?; + let next_fetch_offset = next_in_line_fetch.next_fetch_offset(); + + if next_fetch_offset > current_offset { + self.log_scanner_status + .update_offset(&table_bucket, next_fetch_offset); + } + + Ok(records) + } else { + // These records aren't next in line, ignore them + warn!( + "Ignoring fetched records for {table_bucket:?} at offset {fetch_offset} since the current offset is {current_offset}" + ); + next_in_line_fetch.drain(); + Ok(Vec::new()) + } + } + async fn prepare_fetch_log_requests(&self) -> HashMap { let mut fetch_log_req_for_buckets = HashMap::new(); 
let mut table_id = None; @@ -372,25 +678,44 @@ impl LogFetcher { let offset = match self.log_scanner_status.get_bucket_offset(&bucket) { Some(offset) => offset, None => { - // todo: debug + debug!( + "Skipping fetch request for bucket {bucket} because the bucket has been unsubscribed." + ); continue; } }; - if let Some(leader) = self.get_table_bucket_leader(&bucket) { - let fetch_log_req_for_bucket = PbFetchLogReqForBucket { - partition_id: None, - bucket_id: bucket.bucket_id(), - fetch_offset: offset, - // 1M - max_fetch_bytes: 1024 * 1024, - }; - - fetch_log_req_for_buckets - .entry(leader) - .or_insert_with(Vec::new) - .push(fetch_log_req_for_bucket); - ready_for_fetch_count += 1; + match self.get_table_bucket_leader(&bucket) { + None => { + log::trace!( + "Skipping fetch request for bucket {bucket} because leader is not available." + ) + } + Some(leader) => { + if self + .nodes_with_pending_fetch_requests + .lock() + .contains(&leader) + { + log::trace!( + "Skipping fetch request for bucket {bucket} because previous request to server {leader} has not been processed." 
+ ) + } else { + let fetch_log_req_for_bucket = PbFetchLogReqForBucket { + partition_id: None, + bucket_id: bucket.bucket_id(), + fetch_offset: offset, + // 1M + max_fetch_bytes: 1024 * 1024, + }; + + fetch_log_req_for_buckets + .entry(leader) + .or_insert_with(Vec::new) + .push(fetch_log_req_for_bucket); + ready_for_fetch_count += 1; + } + } } } @@ -427,8 +752,11 @@ impl LogFetcher { } fn fetchable_buckets(&self) -> Vec { - // always available now - self.log_scanner_status.fetchable_buckets(|_| true) + // Get buckets that are not already in the buffer + let buffered = self.log_fetch_buffer.buffered_buckets(); + let buffered_set: HashSet = buffered.into_iter().collect(); + self.log_scanner_status + .fetchable_buckets(|tb| !buffered_set.contains(tb)) } fn get_table_bucket_leader(&self, tb: &TableBucket) -> Option { diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index 9295713cc2..0a803aec6d 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -39,6 +39,7 @@ use arrow_schema::SchemaRef; use arrow_schema::{DataType as ArrowDataType, Field}; use byteorder::WriteBytesExt; use byteorder::{ByteOrder, LittleEndian}; +use bytes::Bytes; use crc32c::crc32c; use parking_lot::Mutex; use std::{ @@ -347,17 +348,17 @@ pub trait ToArrow { fn append_to(&self, builder: &mut dyn ArrayBuilder) -> Result<()>; } -pub struct LogRecordsBatchs<'a> { - data: &'a [u8], +pub struct LogRecordsBatches { + data: Bytes, current_pos: usize, remaining_bytes: usize, } -impl<'a> LogRecordsBatchs<'a> { - pub fn new(data: &'a [u8]) -> Self { +impl LogRecordsBatches { + pub fn new(data: Vec) -> Self { let remaining_bytes: usize = data.len(); Self { - data, + data: Bytes::from(data), current_pos: 0, remaining_bytes, } @@ -378,14 +379,17 @@ impl<'a> LogRecordsBatchs<'a> { } } -impl<'a> Iterator for &'a mut LogRecordsBatchs<'a> { - type Item = LogRecordBatch<'a>; +impl Iterator for 
LogRecordsBatches { + type Item = LogRecordBatch; fn next(&mut self) -> Option { match self.next_batch_size() { Some(batch_size) => { - let data_slice = &self.data[self.current_pos..self.current_pos + batch_size]; - let record_batch = LogRecordBatch::new(data_slice); + let start = self.current_pos; + let end = start + batch_size; + // Since LogRecordsBatches owns the Vec, the slice is valid + // as long as the mutable reference exists, which is 'a + let record_batch = LogRecordBatch::new(self.data.slice(start..end)); self.current_pos += batch_size; self.remaining_bytes -= batch_size; Some(record_batch) @@ -395,13 +399,13 @@ impl<'a> Iterator for &'a mut LogRecordsBatchs<'a> { } } -pub struct LogRecordBatch<'a> { - data: &'a [u8], +pub struct LogRecordBatch { + data: Bytes, } #[allow(dead_code)] -impl<'a> LogRecordBatch<'a> { - pub fn new(data: &'a [u8]) -> Self { +impl LogRecordBatch { + pub fn new(data: Bytes) -> Self { LogRecordBatch { data } } @@ -710,6 +714,7 @@ pub struct ReadContext { target_schema: SchemaRef, full_schema: SchemaRef, projection: Option, + is_from_remote: bool, } #[derive(Clone)] @@ -723,24 +728,39 @@ struct Projection { } impl ReadContext { - pub fn new(arrow_schema: SchemaRef) -> ReadContext { + pub fn new(arrow_schema: SchemaRef, is_from_remote: bool) -> ReadContext { ReadContext { target_schema: arrow_schema.clone(), full_schema: arrow_schema, projection: None, + is_from_remote, } } pub fn with_projection_pushdown( arrow_schema: SchemaRef, projected_fields: Vec, + is_from_remote: bool, ) -> ReadContext { let target_schema = Self::project_schema(arrow_schema.clone(), projected_fields.as_slice()); - let mut sorted_fields = projected_fields.clone(); - sorted_fields.sort_unstable(); + // the logic is little bit of hard to understand, to refactor it to follow + // java side + let (need_do_reorder, sorted_fields) = { + // currently, for remote read, arrow log doesn't support projection pushdown, + // so, only need to do reordering when is not 
from remote + if !is_from_remote { + let mut sorted_fields = projected_fields.clone(); + sorted_fields.sort_unstable(); + (!sorted_fields.eq(&projected_fields), sorted_fields) + } else { + // sorted_fields won't be used when need_do_reorder is false, + // let's use an empty vec directly + (false, vec![]) + } + }; let project = { - if !sorted_fields.eq(&projected_fields) { + if need_do_reorder { // reordering is required // Calculate reordering indexes to transform from sorted order to user-requested order let mut reordering_indexes = Vec::with_capacity(projected_fields.len()); @@ -778,6 +798,7 @@ impl ReadContext { target_schema, full_schema: arrow_schema, projection: Some(project), + is_from_remote, } } @@ -805,17 +826,24 @@ impl ReadContext { pub fn record_batch(&self, data: &[u8]) -> Result { let (batch_metadata, body_buffer, version) = parse_ipc_message(data)?; - // the record batch from server must be ordered by field pos, - // according to project to decide what arrow schema to use - // to parse the record batch - let resolve_schema = match self.projection { - Some(ref projection) => { - // projection, should use ordered schema by project field pos - projection.ordered_schema.clone() - } - None => { - // no projection, use target output schema - self.target_schema.clone() + let resolve_schema = { + // if from remote, no projection, need to use full schema + if self.is_from_remote { + self.full_schema.clone() + } else { + // the record batch from server must be ordered by field pos, + // according to project to decide what arrow schema to use + // to parse the record batch + match self.projection { + Some(ref projection) => { + // projection, should use ordered schema by project field pos + projection.ordered_schema.clone() + } + None => { + // no projection, use target output schema + self.target_schema.clone() + } + } } }; @@ -829,14 +857,27 @@ impl ReadContext { )?; let record_batch = match &self.projection { - Some(projection) if 
projection.reordering_needed => { - // Reorder columns if needed (when projection pushdown with non-sorted order) - let reordered_columns: Vec<_> = projection - .reordering_indexes - .iter() - .map(|&idx| record_batch.column(idx).clone()) - .collect(); - RecordBatch::try_new(self.target_schema.clone(), reordered_columns)? + Some(projection) => { + let reordered_columns = { + // need to do reorder + if self.is_from_remote { + Some(&projection.projected_fields) + } else if projection.reordering_needed { + Some(&projection.reordering_indexes) + } else { + None + } + }; + match reordered_columns { + Some(reordered_columns) => { + let arrow_columns = reordered_columns + .iter() + .map(|&idx| record_batch.column(idx).clone()) + .collect(); + RecordBatch::try_new(self.target_schema.clone(), arrow_columns)? + } + _ => record_batch, + } } _ => record_batch, }; diff --git a/fluss-rust/crates/fluss/tests/integration/table.rs b/fluss-rust/crates/fluss/tests/integration/table.rs index a058bfe0bc..9eec98eaaf 100644 --- a/fluss-rust/crates/fluss/tests/integration/table.rs +++ b/fluss-rust/crates/fluss/tests/integration/table.rs @@ -148,7 +148,7 @@ mod table_test { } let scan_records = log_scanner - .poll(std::time::Duration::from_secs(5)) + .poll(std::time::Duration::from_secs(60)) .await .expect("Failed to poll"); @@ -178,7 +178,7 @@ mod table_test { } let scan_records_projected = log_scanner_projected - .poll(std::time::Duration::from_secs(5)) + .poll(std::time::Duration::from_secs(10)) .await .expect("Failed to poll"); @@ -227,7 +227,7 @@ mod table_test { // Poll for records let scan_records = log_scanner - .poll(tokio::time::Duration::from_secs(5)) + .poll(tokio::time::Duration::from_secs(10)) .await .expect("Failed to poll records"); diff --git a/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs b/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs index ca61ff8502..bdbced95c4 100644 --- a/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs 
+++ b/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs @@ -175,6 +175,8 @@ mod table_remote_scan_test { let num_buckets = table.table_info().get_num_buckets(); let log_scanner = table .new_scan() + .project(&[1, 0]) + .unwrap() .create_log_scanner() .expect("Failed to create log scanner"); for bucket_id in 0..num_buckets { @@ -186,7 +188,7 @@ mod table_remote_scan_test { let mut records = Vec::with_capacity(record_count); let start = std::time::Instant::now(); - const MAX_WAIT_DURATION: Duration = Duration::from_secs(30); + const MAX_WAIT_DURATION: Duration = Duration::from_secs(60); while records.len() < record_count { if start.elapsed() > MAX_WAIT_DURATION { panic!( @@ -208,8 +210,8 @@ mod table_remote_scan_test { let row = record.row(); let expected_c1 = i as i32; let expected_c2 = format!("v{}", i); - assert_eq!(row.get_int(0), expected_c1, "c1 mismatch at index {}", i); - assert_eq!(row.get_string(1), expected_c2, "c2 mismatch at index {}", i); + assert_eq!(row.get_int(1), expected_c1, "c1 mismatch at index {}", i); + assert_eq!(row.get_string(0), expected_c2, "c2 mismatch at index {}", i); } } From 7515f71591a94be910d4bb220bb81c4dea4ad250 Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Mon, 22 Dec 2025 09:43:43 +0000 Subject: [PATCH 042/287] chore: fix bug where column prune won't work when compression is enabled (#108) --- fluss-rust/bindings/cpp/examples/example.cpp | 1 - .../fluss/src/client/write/accumulator.rs | 2 + .../crates/fluss/src/client/write/batch.rs | 4 + .../src/compression/arrow_compression.rs | 245 ++++++++++++++++++ .../crates/fluss/src/compression/mod.rs | 20 ++ fluss-rust/crates/fluss/src/lib.rs | 1 + fluss-rust/crates/fluss/src/metadata/table.rs | 5 + fluss-rust/crates/fluss/src/record/arrow.rs | 21 +- .../crates/fluss/tests/integration/table.rs | 1 - .../tests/integration/table_remote_scan.rs | 1 - 10 files changed, 296 insertions(+), 5 deletions(-) create mode 100644 
fluss-rust/crates/fluss/src/compression/arrow_compression.rs create mode 100644 fluss-rust/crates/fluss/src/compression/mod.rs diff --git a/fluss-rust/bindings/cpp/examples/example.cpp b/fluss-rust/bindings/cpp/examples/example.cpp index 04f9ac64f5..6ff2b9b7e3 100644 --- a/fluss-rust/bindings/cpp/examples/example.cpp +++ b/fluss-rust/bindings/cpp/examples/example.cpp @@ -61,7 +61,6 @@ int main() { auto descriptor = fluss::TableDescriptor::NewBuilder() .SetSchema(schema) .SetBucketCount(3) - .SetProperty("table.log.arrow.compression.type", "NONE") .SetComment("cpp example table with 3 buckets") .Build(); diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs index e4ca957827..215adbe695 100644 --- a/fluss-rust/crates/fluss/src/client/write/accumulator.rs +++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs @@ -94,6 +94,7 @@ impl RecordAccumulator { let table_path = &record.table_path; let table_info = cluster.get_table(table_path); + let arrow_compression_info = table_info.get_table_config().get_arrow_compression_info()?; let row_type = &cluster.get_table(table_path).row_type; let schema_id = table_info.schema_id; @@ -102,6 +103,7 @@ impl RecordAccumulator { self.batch_id.fetch_add(1, Ordering::Relaxed), table_path.as_ref().clone(), schema_id, + arrow_compression_info, row_type, bucket_id, current_time_ms(), diff --git a/fluss-rust/crates/fluss/src/client/write/batch.rs b/fluss-rust/crates/fluss/src/client/write/batch.rs index 13b3d36402..ba04db4ac6 100644 --- a/fluss-rust/crates/fluss/src/client/write/batch.rs +++ b/fluss-rust/crates/fluss/src/client/write/batch.rs @@ -18,6 +18,7 @@ use crate::BucketId; use crate::client::broadcast::{BatchWriteResult, BroadcastOnce}; use crate::client::{ResultHandle, WriteRecord}; +use crate::compression::ArrowCompressionInfo; use crate::error::Result; use crate::metadata::{DataType, TablePath}; use crate::record::MemoryLogRecordsArrowBuilder; @@ 
-132,10 +133,12 @@ pub struct ArrowLogWriteBatch { } impl ArrowLogWriteBatch { + #[allow(clippy::too_many_arguments)] pub fn new( batch_id: i64, table_path: TablePath, schema_id: i32, + arrow_compression_info: ArrowCompressionInfo, row_type: &DataType, bucket_id: BucketId, create_ms: i64, @@ -148,6 +151,7 @@ impl ArrowLogWriteBatch { schema_id, row_type, to_append_record_batch, + arrow_compression_info, ), } } diff --git a/fluss-rust/crates/fluss/src/compression/arrow_compression.rs b/fluss-rust/crates/fluss/src/compression/arrow_compression.rs new file mode 100644 index 0000000000..32dfadb48c --- /dev/null +++ b/fluss-rust/crates/fluss/src/compression/arrow_compression.rs @@ -0,0 +1,245 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::error::{Error, Result}; +use arrow::ipc::CompressionType; +use std::collections::HashMap; + +pub const TABLE_LOG_ARROW_COMPRESSION_ZSTD_LEVEL: &str = "table.log.arrow.compression.zstd.level"; +pub const TABLE_LOG_ARROW_COMPRESSION_TYPE: &str = "table.log.arrow.compression.type"; +pub const DEFAULT_NON_ZSTD_COMPRESSION_LEVEL: i32 = -1; +pub const DEFAULT_ZSTD_COMPRESSION_LEVEL: i32 = 3; + +#[derive(Clone, Debug, PartialEq)] +pub enum ArrowCompressionType { + None, + Lz4Frame, + Zstd, +} + +impl ArrowCompressionType { + fn from_conf(properties: &HashMap) -> Result { + match properties + .get(TABLE_LOG_ARROW_COMPRESSION_TYPE) + .map(|s| s.as_str()) + { + Some("NONE") => Ok(Self::None), + Some("LZ4_FRAME") => Ok(Self::Lz4Frame), + Some("ZSTD") => Ok(Self::Zstd), + Some(other) => Err(Error::IllegalArgument { + message: format!("Unsupported compression type: {other}"), + }), + None => Ok(Self::Zstd), + } + } +} + +#[derive(Clone, Debug)] +pub struct ArrowCompressionInfo { + pub compression_type: ArrowCompressionType, + pub compression_level: i32, +} + +impl ArrowCompressionInfo { + pub fn from_conf(properties: &HashMap) -> Result { + let compression_type = ArrowCompressionType::from_conf(properties)?; + + if compression_type != ArrowCompressionType::Zstd { + return Ok(Self { + compression_type, + compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }); + } + + match properties + .get(TABLE_LOG_ARROW_COMPRESSION_ZSTD_LEVEL) + .map(|s| s.as_str().parse::()) + { + Some(Ok(level)) if !(1..=22).contains(&level) => Err(Error::IllegalArgument { + message: format!( + "Invalid ZSTD compression level: {}. Expected a value between 1 and 22.", + level + ), + }), + Some(Err(e)) => Err(Error::IllegalArgument { + message: format!( + "Invalid ZSTD compression level. Expected a value between 1 and 22. 
{}", + e + ), + }), + + Some(Ok(level)) => Ok(Self { + compression_type, + compression_level: level, + }), + None => Ok(Self { + compression_type, + compression_level: DEFAULT_ZSTD_COMPRESSION_LEVEL, + }), + } + } + + #[cfg(test)] + fn new(compression_type: ArrowCompressionType, compression_level: i32) -> ArrowCompressionInfo { + Self { + compression_type, + compression_level, + } + } + + pub fn get_compression_type(&self) -> Option { + match self.compression_type { + ArrowCompressionType::Zstd => Some(CompressionType::ZSTD), + ArrowCompressionType::Lz4Frame => Some(CompressionType::LZ4_FRAME), + ArrowCompressionType::None => None, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + #[test] + fn test_from_conf() { + assert_eq!( + ArrowCompressionType::from_conf(&HashMap::new()).unwrap(), + ArrowCompressionType::Zstd + ); + + assert_eq!( + ArrowCompressionType::from_conf(&mk_map(&[( + "table.log.arrow.compression.type", + "NONE" + )])) + .unwrap(), + ArrowCompressionType::None + ); + + assert_eq!( + ArrowCompressionType::from_conf(&mk_map(&[( + "table.log.arrow.compression.type", + "LZ4_FRAME" + )])) + .unwrap(), + ArrowCompressionType::Lz4Frame + ); + + assert_eq!( + ArrowCompressionType::from_conf(&mk_map(&[( + "table.log.arrow.compression.type", + "ZSTD" + )])) + .unwrap(), + ArrowCompressionType::Zstd + ); + } + + #[test] + fn test_from_conf_invalid_compression_type() { + let props = mk_map(&[("table.log.arrow.compression.type", "FOO")]); + + assert!( + ArrowCompressionInfo::from_conf(&props) + .unwrap_err() + .to_string() + .contains( + "Fluss hitting illegal argument error Unsupported compression type: FOO." 
+ ) + ); + } + + #[test] + fn test_from_conf_zstd_compression_level() { + let compression_info = ArrowCompressionInfo::from_conf(&mk_map(&[( + "table.log.arrow.compression.type", + "ZSTD", + )])); + assert_eq!(compression_info.unwrap().compression_level, 3); + let compression_info = ArrowCompressionInfo::from_conf(&mk_map(&[ + ("table.log.arrow.compression.type", "ZSTD"), + ("table.log.arrow.compression.zstd.level", "1"), + ])); + assert_eq!(compression_info.unwrap().compression_level, 1); + } + + #[test] + fn test_from_conf_compression_level_out_of_range() { + let props = mk_map(&[ + ("table.log.arrow.compression.type", "ZSTD"), + ("table.log.arrow.compression.zstd.level", "0"), + ]); + + assert!( + ArrowCompressionInfo::from_conf(&props) + .unwrap_err() + .to_string() + .contains("Expected a value between 1 and 22.") + ); + + let props = mk_map(&[ + ("table.log.arrow.compression.type", "ZSTD"), + ("table.log.arrow.compression.zstd.level", "23"), + ]); + + assert!( + ArrowCompressionInfo::from_conf(&props) + .unwrap_err() + .to_string() + .contains("Expected a value between 1 and 22.") + ); + } + + #[test] + fn test_from_conf_compression_level_parse_error() { + let props = mk_map(&[ + ("table.log.arrow.compression.type", "ZSTD"), + ("table.log.arrow.compression.zstd.level", "not-a-number"), + ]); + + assert!( + ArrowCompressionInfo::from_conf(&props) + .unwrap_err() + .to_string() + .contains("Expected a value between 1 and 22.") + ); + } + + #[test] + fn get_compression_type_maps_correctly() { + assert_eq!( + ArrowCompressionInfo::new(ArrowCompressionType::None, -1).get_compression_type(), + None + ); + assert_eq!( + ArrowCompressionInfo::new(ArrowCompressionType::Lz4Frame, -1).get_compression_type(), + Some(CompressionType::LZ4_FRAME) + ); + assert_eq!( + ArrowCompressionInfo::new(ArrowCompressionType::Zstd, -1).get_compression_type(), + Some(CompressionType::ZSTD) + ); + } + + fn mk_map(pairs: &[(&str, &str)]) -> HashMap { + pairs + .iter() + .map(|(k, v)| 
(k.to_string(), v.to_string())) + .collect() + } +} diff --git a/fluss-rust/crates/fluss/src/compression/mod.rs b/fluss-rust/crates/fluss/src/compression/mod.rs new file mode 100644 index 0000000000..2b86dba77d --- /dev/null +++ b/fluss-rust/crates/fluss/src/compression/mod.rs @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod arrow_compression; + +pub use arrow_compression::*; diff --git a/fluss-rust/crates/fluss/src/lib.rs b/fluss-rust/crates/fluss/src/lib.rs index 366edfc60a..25978ce0bb 100644 --- a/fluss-rust/crates/fluss/src/lib.rs +++ b/fluss-rust/crates/fluss/src/lib.rs @@ -26,6 +26,7 @@ mod cluster; pub mod config; pub mod error; +mod compression; pub mod io; mod util; diff --git a/fluss-rust/crates/fluss/src/metadata/table.rs b/fluss-rust/crates/fluss/src/metadata/table.rs index 770c4f2cfe..4f6c04bc61 100644 --- a/fluss-rust/crates/fluss/src/metadata/table.rs +++ b/fluss-rust/crates/fluss/src/metadata/table.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use crate::compression::ArrowCompressionInfo; use crate::error::Error::InvalidTableError; use crate::error::{Error, Result}; use crate::metadata::datatype::{DataField, DataType, RowType}; @@ -721,6 +722,10 @@ impl TableConfig { pub fn from_properties(properties: HashMap) -> Self { TableConfig { properties } } + + pub fn get_arrow_compression_info(&self) -> Result { + ArrowCompressionInfo::from_conf(&self.properties) + } } impl TableInfo { diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index 0a803aec6d..5a5115edfa 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -16,6 +16,7 @@ // under the License. use crate::client::{Record, WriteRecord}; +use crate::compression::ArrowCompressionInfo; use crate::error::Result; use crate::metadata::DataType; use crate::record::{ChangeType, ScanRecord}; @@ -47,6 +48,7 @@ use std::{ sync::Arc, }; +use arrow::ipc::writer::IpcWriteOptions; /// const for record batch pub const BASE_OFFSET_LENGTH: usize = 8; pub const LENGTH_LENGTH: usize = 4; @@ -104,6 +106,7 @@ pub struct MemoryLogRecordsArrowBuilder { batch_sequence: i32, arrow_record_batch_builder: Box, is_closed: bool, + arrow_compression_info: ArrowCompressionInfo, } pub trait ArrowRecordBatchInnerBuilder: Send + Sync { @@ -244,7 +247,12 @@ impl ArrowRecordBatchInnerBuilder for RowAppendRecordBatchBuilder { } impl MemoryLogRecordsArrowBuilder { - pub fn new(schema_id: i32, row_type: &DataType, to_append_record_batch: bool) -> Self { + pub fn new( + schema_id: i32, + row_type: &DataType, + to_append_record_batch: bool, + arrow_compression_info: ArrowCompressionInfo, + ) -> Self { let arrow_batch_builder: Box = { if to_append_record_batch { Box::new(PrebuiltRecordBatchBuilder::default()) @@ -260,6 +268,7 @@ impl MemoryLogRecordsArrowBuilder { batch_sequence: NO_BATCH_SEQUENCE, is_closed: false, arrow_record_batch_builder: arrow_batch_builder, + arrow_compression_info, 
} } @@ -289,7 +298,15 @@ impl MemoryLogRecordsArrowBuilder { // serialize arrow batch let mut arrow_batch_bytes = vec![]; let table_schema = self.arrow_record_batch_builder.schema(); - let mut writer = StreamWriter::try_new(&mut arrow_batch_bytes, &table_schema)?; + let compression_type = self.arrow_compression_info.get_compression_type(); + let write_option = + IpcWriteOptions::try_with_compression(IpcWriteOptions::default(), compression_type); + let mut writer = StreamWriter::try_new_with_options( + &mut arrow_batch_bytes, + &table_schema, + write_option?, + )?; + // get header len let header = writer.get_ref().len(); let record_batch = self.arrow_record_batch_builder.build_arrow_record_batch()?; diff --git a/fluss-rust/crates/fluss/tests/integration/table.rs b/fluss-rust/crates/fluss/tests/integration/table.rs index 9eec98eaaf..3f7946ee9f 100644 --- a/fluss-rust/crates/fluss/tests/integration/table.rs +++ b/fluss-rust/crates/fluss/tests/integration/table.rs @@ -103,7 +103,6 @@ mod table_test { .build() .expect("Failed to build schema"), ) - .property("table.log.arrow.compression.type", "NONE") .build() .expect("Failed to build table"); diff --git a/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs b/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs index bdbced95c4..43c89b5468 100644 --- a/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs +++ b/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs @@ -142,7 +142,6 @@ mod table_remote_scan_test { .build() .expect("Failed to build schema"), ) - .property("table.log.arrow.compression.type", "NONE") .build() .expect("Failed to build table"); From 2bf3701aaca36aef16767097644cecb0bf56149a Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Thu, 25 Dec 2025 14:02:50 +0000 Subject: [PATCH 043/287] chore: scanner should retry when bucket lead is not available (#102) --- .../crates/fluss/src/client/credentials.rs | 4 +- .../crates/fluss/src/client/metadata.rs | 51 +++++++++--- 
.../crates/fluss/src/client/table/scanner.rs | 78 ++++++++++++++++--- .../crates/fluss/src/cluster/cluster.rs | 52 ++++++++++--- .../crates/fluss/src/rpc/server_connection.rs | 25 +++++- 5 files changed, 175 insertions(+), 35 deletions(-) diff --git a/fluss-rust/crates/fluss/src/client/credentials.rs b/fluss-rust/crates/fluss/src/client/credentials.rs index 8adfe48b99..ffb682ed7d 100644 --- a/fluss-rust/crates/fluss/src/client/credentials.rs +++ b/fluss-rust/crates/fluss/src/client/credentials.rs @@ -118,7 +118,9 @@ impl CredentialsCache { async fn refresh_from_server(&self) -> Result> { let cluster = self.metadata.get_cluster(); - let server_node = cluster.get_one_available_server(); + let server_node = cluster + .get_one_available_server() + .expect("no tablet server available"); let conn = self.rpc_client.get_connection(server_node).await?; let request = GetSecurityTokenRequest::new(); diff --git a/fluss-rust/crates/fluss/src/client/metadata.rs b/fluss-rust/crates/fluss/src/client/metadata.rs index 3c3ba4bd2e..a51442254c 100644 --- a/fluss-rust/crates/fluss/src/client/metadata.rs +++ b/fluss-rust/crates/fluss/src/client/metadata.rs @@ -16,38 +16,40 @@ // under the License. 
use crate::cluster::{Cluster, ServerNode, ServerType}; +use crate::error::Result; use crate::metadata::{TableBucket, TablePath}; +use crate::proto::MetadataResponse; use crate::rpc::message::UpdateMetadataRequest; use crate::rpc::{RpcClient, ServerConnection}; +use log::info; use parking_lot::RwLock; use std::collections::HashSet; use std::net::SocketAddr; use std::sync::Arc; -use crate::error::Result; -use crate::proto::MetadataResponse; - #[derive(Default)] pub struct Metadata { cluster: RwLock>, connections: Arc, + bootstrap: Arc, } impl Metadata { - pub async fn new(boot_strap: &str, connections: Arc) -> Result { - let custer = Self::init_cluster(boot_strap, connections.clone()).await?; + pub async fn new(bootstrap: &str, connections: Arc) -> Result { + let cluster = Self::init_cluster(bootstrap, connections.clone()).await?; Ok(Metadata { - cluster: RwLock::new(Arc::new(custer)), + cluster: RwLock::new(Arc::new(cluster)), connections, + bootstrap: bootstrap.into(), }) } async fn init_cluster(boot_strap: &str, connections: Arc) -> Result { - let socker_addrss = boot_strap.parse::().unwrap(); + let socket_address = boot_strap.parse::().unwrap(); let server_node = ServerNode::new( -1, - socker_addrss.ip().to_string(), - socker_addrss.port() as u32, + socket_address.ip().to_string(), + socket_address.port() as u32, ServerType::CoordinatorServer, ); let con = connections.get_connection(&server_node).await?; @@ -55,6 +57,20 @@ impl Metadata { Cluster::from_metadata_response(response, None) } + async fn reinit_cluster(&self) -> Result<()> { + let cluster = Self::init_cluster(&self.bootstrap, self.connections.clone()).await?; + *self.cluster.write() = cluster.into(); + Ok(()) + } + + pub fn invalidate_server(&self, server_id: &i32, table_ids: Vec) { + // Take a write lock for the entire operation to avoid races between + // reading the current cluster state and writing back the updated one. 
+ let mut cluster_guard = self.cluster.write(); + let updated_cluster = cluster_guard.invalidate_server(server_id, table_ids); + *cluster_guard = Arc::new(updated_cluster); + } + pub async fn update(&self, metadata_response: MetadataResponse) -> Result<()> { let origin_cluster = self.cluster.read().clone(); let new_cluster = @@ -65,7 +81,22 @@ impl Metadata { } pub async fn update_tables_metadata(&self, table_paths: &HashSet<&TablePath>) -> Result<()> { - let server = self.cluster.read().get_one_available_server().clone(); + let maybe_server = { + let guard = self.cluster.read(); + guard.get_one_available_server().cloned() + }; + + let server = match maybe_server { + Some(s) => s, + None => { + info!( + "No available tablet server to update metadata, attempting to re-initialize cluster using bootstrap server." + ); + self.reinit_cluster().await?; + return Ok(()); + } + }; + let conn = self.connections.get_connection(&server).await?; let update_table_paths: Vec<&TablePath> = table_paths.iter().copied().collect(); diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index 2246e2cbae..11bdfa3148 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -24,7 +24,7 @@ use crate::client::table::log_fetch_buffer::{ use crate::client::table::remote_log::{ RemoteLogDownloader, RemoteLogFetchInfo, RemotePendingFetch, }; -use crate::error::{Error, Result}; +use crate::error::{Error, Result, RpcError}; use crate::metadata::{TableBucket, TableInfo, TablePath}; use crate::proto::{FetchLogRequest, PbFetchLogReqForBucket, PbFetchLogReqForTable}; use crate::record::{LogRecordsBatches, ReadContext, ScanRecord, ScanRecords, to_arrow_schema}; @@ -271,6 +271,8 @@ struct LogFetcher { credentials_cache: Arc, log_fetch_buffer: Arc, nodes_with_pending_fetch_requests: Arc>>, + table_path: TablePath, + is_partitioned: bool, } impl LogFetcher { @@ -299,6 
+301,8 @@ impl LogFetcher { credentials_cache: Arc::new(CredentialsCache::new(conns.clone(), metadata.clone())), log_fetch_buffer: Arc::new(LogFetchBuffer::new()), nodes_with_pending_fetch_requests: Arc::new(Mutex::new(HashSet::new())), + table_path: table_info.table_path.clone(), + is_partitioned: table_info.is_partitioned(), }) } @@ -315,9 +319,45 @@ impl LogFetcher { } } + async fn check_and_update_metadata(&self) -> Result<()> { + if self.is_partitioned { + // TODO: Implement partition-aware metadata refresh for buckets whose leaders are unknown. + // The implementation will likely need to collect partition IDs for such buckets and + // perform targeted metadata updates. Until then, we avoid computing unused partition_ids. + return Ok(()); + } + + let need_update = self + .fetchable_buckets() + .iter() + .any(|bucket| self.get_table_bucket_leader(bucket).is_none()); + + if !need_update { + return Ok(()); + } + + // TODO: Handle PartitionNotExist error + self.metadata + .update_tables_metadata(&HashSet::from([&self.table_path])) + .await + .or_else(|e| { + if let Error::RpcError { source, .. 
} = &e + && matches!(source, RpcError::ConnectionError(_) | RpcError::Poisoned(_)) + { + warn!( + "Retrying after encountering error while updating table metadata: {}", + e + ); + Ok(()) + } else { + Err(e) + } + }) + } + /// Send fetch requests asynchronously without waiting for responses async fn send_fetches(&self) -> Result<()> { - // todo: check update metadata like fluss-java in case leader changes + self.check_and_update_metadata().await?; let fetch_request = self.prepare_fetch_log_requests().await; for (leader, fetch_request) in fetch_request { @@ -337,6 +377,7 @@ impl LogFetcher { let remote_log_downloader = Arc::clone(&self.remote_log_downloader); let creds_cache = self.credentials_cache.clone(); let nodes_with_pending = self.nodes_with_pending_fetch_requests.clone(); + let metadata = self.metadata.clone(); // Spawn async task to handle the fetch request // Note: These tasks are not explicitly tracked or cancelled when LogFetcher is dropped. @@ -351,27 +392,34 @@ impl LogFetcher { nodes_with_pending.lock().remove(&leader); }); - let server_node = cluster - .get_tablet_server(leader) - .expect("todo: handle leader not exist."); + let server_node = match cluster.get_tablet_server(leader) { + Some(node) => node, + None => { + warn!("No server node found for leader {}, retrying", leader); + Self::handle_fetch_failure(metadata, &leader, &fetch_request).await; + return; + } + }; let con = match conns.get_connection(server_node).await { Ok(con) => con, Err(e) => { - // todo: handle failed to get connection - warn!("Failed to get connection to destination node: {e:?}"); + warn!("Retrying after error getting connection to destination node: {e:?}"); + Self::handle_fetch_failure(metadata, &leader, &fetch_request).await; return; } }; let fetch_response = match con - .request(message::FetchLogRequest::new(fetch_request)) + .request(message::FetchLogRequest::new(fetch_request.clone())) .await { Ok(resp) => resp, Err(e) => { - // todo: handle fetch log from destination 
node - warn!("Failed to fetch log from destination node {server_node:?}: {e:?}"); + warn!( + "Retrying after error fetching log from destination node {server_node:?}: {e:?}" + ); + Self::handle_fetch_failure(metadata, &leader, &fetch_request).await; return; } }; @@ -387,7 +435,6 @@ impl LogFetcher { ) .await { - // todo: handle fail to handle fetch response error!("Fail to handle fetch response: {e:?}"); } }); @@ -396,6 +443,15 @@ impl LogFetcher { Ok(()) } + async fn handle_fetch_failure( + metadata: Arc, + server_id: &i32, + request: &FetchLogRequest, + ) { + let table_ids = request.tables_req.iter().map(|r| r.table_id).collect(); + metadata.invalidate_server(server_id, table_ids); + } + /// Handle fetch response and add completed fetches to buffer async fn handle_fetch_response( fetch_response: crate::proto::FetchLogResponse, diff --git a/fluss-rust/crates/fluss/src/cluster/cluster.rs b/fluss-rust/crates/fluss/src/cluster/cluster.rs index a6f20a8262..f14d055f9b 100644 --- a/fluss-rust/crates/fluss/src/cluster/cluster.rs +++ b/fluss-rust/crates/fluss/src/cluster/cluster.rs @@ -22,7 +22,7 @@ use crate::metadata::{JsonSerde, TableBucket, TableDescriptor, TableInfo, TableP use crate::proto::MetadataResponse; use crate::rpc::{from_pb_server_node, from_pb_table_path}; use rand::random_range; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; static EMPTY: Vec = Vec::new(); @@ -64,6 +64,43 @@ impl Cluster { } } + pub fn invalidate_server(&self, server_id: &i32, table_ids: Vec) -> Self { + let alive_tablet_servers_by_id = self + .alive_tablet_servers_by_id + .iter() + .filter(|&(id, _)| id != server_id) + .map(|(id, ts)| (*id, ts.clone())) + .collect(); + + let table_paths: HashSet<&TablePath> = table_ids + .iter() + .filter_map(|id| self.table_path_by_id.get(id)) + .collect(); + + let available_locations_by_path = self + .available_locations_by_path + .iter() + .filter(|&(path, _)| !table_paths.contains(path)) + .map(|(path, locations)| 
(path.clone(), locations.clone())) + .collect(); + + let available_locations_by_bucket = self + .available_locations_by_bucket + .iter() + .filter(|&(_bucket, location)| !table_paths.contains(&location.table_path)) + .map(|(bucket, location)| (bucket.clone(), location.clone())) + .collect(); + + Cluster::new( + self.coordinator_server.clone(), + alive_tablet_servers_by_id, + available_locations_by_path, + available_locations_by_bucket, + self.table_id_by_path.clone(), + self.table_info_by_path.clone(), + ) + } + pub fn update(&mut self, cluster: Cluster) { let Cluster { coordinator_server, @@ -214,15 +251,12 @@ impl Cluster { .unwrap_or(&EMPTY) } - pub fn get_one_available_server(&self) -> &ServerNode { - assert!( - !self.alive_tablet_servers.is_empty(), - "no alive tablet server in cluster" - ); + pub fn get_one_available_server(&self) -> Option<&ServerNode> { + if self.alive_tablet_servers.is_empty() { + return None; + } let offset = random_range(0..self.alive_tablet_servers.len()); - self.alive_tablet_servers - .get(offset) - .unwrap_or_else(|| panic!("can't find alive tab server by offset {offset}")) + self.alive_tablet_servers.get(offset) } pub fn get_bucket_count(&self, table_path: &TablePath) -> i32 { diff --git a/fluss-rust/crates/fluss/src/rpc/server_connection.rs b/fluss-rust/crates/fluss/src/rpc/server_connection.rs index fdeb56fbbe..441b175a24 100644 --- a/fluss-rust/crates/fluss/src/rpc/server_connection.rs +++ b/fluss-rust/crates/fluss/src/rpc/server_connection.rs @@ -66,13 +66,25 @@ impl RpcClient { server_node: &ServerNode, ) -> Result { let server_id = server_node.uid(); - { + let connection = { let connections = self.connections.read(); - if let Some(connection) = connections.get(server_id) { - return Ok(connection.clone()); + connections.get(server_id).cloned() + }; + + if let Some(conn) = connection { + if !conn.is_poisoned() { + return Ok(conn); } } - let new_server = self.connect(server_node).await?; + + let new_server = match 
self.connect(server_node).await { + Ok(new_server) => new_server, + Err(e) => { + self.connections.write().remove(server_id); + return Err(e); + } + }; + self.connections .write() .insert(server_id.clone(), new_server.clone()); @@ -231,6 +243,11 @@ where } } + fn is_poisoned(&self) -> bool { + let guard = self.state.lock(); + matches!(*guard, ConnectionState::Poison(_)) + } + pub async fn request(&self, msg: R) -> Result where R: RequestBody + Send + WriteVersionedType>, From 02ca3eb4f410017f9c72e482cc5605b59c13b41b Mon Sep 17 00:00:00 2001 From: Pavlos-Petros Tournaris Date: Thu, 25 Dec 2025 16:47:42 +0200 Subject: [PATCH 044/287] chore: add integration test for list offsets (#52) --------- Co-authored-by: luoyuxia --- .../crates/fluss/tests/integration/table.rs | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/fluss-rust/crates/fluss/tests/integration/table.rs b/fluss-rust/crates/fluss/tests/integration/table.rs index 3f7946ee9f..006adcc45b 100644 --- a/fluss-rust/crates/fluss/tests/integration/table.rs +++ b/fluss-rust/crates/fluss/tests/integration/table.rs @@ -38,8 +38,11 @@ mod table_test { use arrow::array::record_batch; use fluss::metadata::{DataTypes, Schema, TableBucket, TableDescriptor, TablePath}; use fluss::row::InternalRow; + use fluss::rpc::message::OffsetSpec; + use jiff::Timestamp; use std::sync::Arc; use std::thread; + fn before_all() { // Create a new tokio runtime in a separate thread let cluster_guard = SHARED_FLUSS_CLUSTER.clone(); @@ -256,4 +259,127 @@ mod table_test { ); } } + + #[tokio::test] + async fn list_offsets() { + let cluster = get_fluss_cluster(); + let connection = cluster.get_fluss_connection().await; + + let admin = connection.get_admin().await.expect("Failed to get admin"); + + let table_path = TablePath::new("fluss".to_string(), "test_list_offsets".to_string()); + + let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("id", DataTypes::int()) + .column("name", 
DataTypes::string()) + .build() + .expect("Failed to build schema"), + ) + .build() + .expect("Failed to build table"); + + create_table(&admin, &table_path, &table_descriptor).await; + + // Wait for table to be fully initialized + tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; + + // Test earliest offset (should be 0 for empty table) + let earliest_offsets = admin + .list_offsets(&table_path, &[0], OffsetSpec::Earliest) + .await + .expect("Failed to list earliest offsets"); + + assert_eq!( + earliest_offsets.get(&0), + Some(&0), + "Earliest offset should be 0 for bucket 0" + ); + + // Test latest offset (should be 0 for empty table) + let latest_offsets = admin + .list_offsets(&table_path, &[0], OffsetSpec::Latest) + .await + .expect("Failed to list latest offsets"); + + assert_eq!( + latest_offsets.get(&0), + Some(&0), + "Latest offset should be 0 for empty table" + ); + + let before_append_ms = Timestamp::now().as_millisecond(); + + // Append some records + let append_writer = connection + .get_table(&table_path) + .await + .expect("Failed to get table") + .new_append() + .expect("Failed to create append") + .create_writer(); + + let batch = record_batch!( + ("id", Int32, [1, 2, 3]), + ("name", Utf8, ["alice", "bob", "charlie"]) + ) + .unwrap(); + append_writer + .append_arrow_batch(batch) + .await + .expect("Failed to append batch"); + + tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; + + let after_append_ms = Timestamp::now().as_millisecond(); + + // Test latest offset after appending (should be 3) + let latest_offsets_after = admin + .list_offsets(&table_path, &[0], OffsetSpec::Latest) + .await + .expect("Failed to list latest offsets after append"); + + assert_eq!( + latest_offsets_after.get(&0), + Some(&3), + "Latest offset should be 3 after appending 3 records" + ); + + // Test earliest offset after appending (should still be 0) + let earliest_offsets_after = admin + .list_offsets(&table_path, &[0], OffsetSpec::Earliest) + 
.await + .expect("Failed to list earliest offsets after append"); + + assert_eq!( + earliest_offsets_after.get(&0), + Some(&0), + "Earliest offset should still be 0" + ); + + // Test list_offsets_by_timestamp + + let timestamp_offsets = admin + .list_offsets(&table_path, &[0], OffsetSpec::Timestamp(before_append_ms)) + .await + .expect("Failed to list offsets by timestamp"); + + assert_eq!( + timestamp_offsets.get(&0), + Some(&0), + "Timestamp before append should resolve to offset 0 (start of new data)" + ); + + let timestamp_offsets = admin + .list_offsets(&table_path, &[0], OffsetSpec::Timestamp(after_append_ms)) + .await + .expect("Failed to list offsets by timestamp"); + + assert_eq!( + timestamp_offsets.get(&0), + Some(&3), + "Timestamp after append should resolve to offset 0 (no newer records)" + ); + } } From 4a225d501ce5219f7e96cc031bfbbe7561e43022 Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Mon, 29 Dec 2025 11:14:01 +0000 Subject: [PATCH 045/287] feat: Introduce HashBucketAssigner (#117) --- fluss-rust/crates/fluss/src/bucketing/mod.rs | 266 ++++++++++++++++++ .../fluss/src/client/write/bucket_assigner.rs | 54 +++- .../fluss/src/client/write/writer_client.rs | 13 +- fluss-rust/crates/fluss/src/lib.rs | 1 + .../fluss/src/metadata/data_lake_format.rs | 30 ++ fluss-rust/crates/fluss/src/metadata/mod.rs | 2 + fluss-rust/crates/fluss/src/util/mod.rs | 2 + .../crates/fluss/src/util/murmur_hash.rs | 222 +++++++++++++++ 8 files changed, 581 insertions(+), 9 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/bucketing/mod.rs create mode 100644 fluss-rust/crates/fluss/src/metadata/data_lake_format.rs create mode 100644 fluss-rust/crates/fluss/src/util/murmur_hash.rs diff --git a/fluss-rust/crates/fluss/src/bucketing/mod.rs b/fluss-rust/crates/fluss/src/bucketing/mod.rs new file mode 100644 index 0000000000..2611ac7ecf --- /dev/null +++ b/fluss-rust/crates/fluss/src/bucketing/mod.rs @@ -0,0 +1,266 @@ +// Licensed to the Apache Software Foundation (ASF) 
under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::DataLakeFormat; +use crate::util::murmur_hash; + +pub trait BucketingFunction: Sync + Send { + fn bucketing(&self, bucket_key: &[u8], num_buckets: i32) -> Result; +} + +#[allow(dead_code)] +impl dyn BucketingFunction { + /// Provides the bucketing function for a given [DataLakeFormat] + /// + /// # Arguments + /// * `lake_format` - Data lake format or none + /// + /// # Returns + /// * BucketingFunction + pub fn of(lake_format: Option<&DataLakeFormat>) -> Box { + match lake_format { + None => Box::new(FlussBucketingFunction), + Some(DataLakeFormat::Paimon) => Box::new(PaimonBucketingFunction), + Some(DataLakeFormat::Lance) => Box::new(FlussBucketingFunction), + Some(DataLakeFormat::Iceberg) => Box::new(IcebergBucketingFunction), + } + } +} + +struct FlussBucketingFunction; +impl BucketingFunction for FlussBucketingFunction { + fn bucketing(&self, bucket_key: &[u8], num_buckets: i32) -> Result { + if bucket_key.is_empty() { + return Err(IllegalArgument { + message: "bucket_key must not be empty!".to_string(), + }); + } + + if num_buckets <= 0 { + return Err(IllegalArgument { + message: "num_buckets must be positive!".to_string(), 
+ }); + } + + let key_hash = murmur_hash::fluss_hash_bytes(bucket_key)?; + + Ok(murmur_hash::fluss_hash_i32(key_hash) % num_buckets) + } +} + +struct PaimonBucketingFunction; +impl BucketingFunction for PaimonBucketingFunction { + fn bucketing(&self, bucket_key: &[u8], num_buckets: i32) -> Result { + if bucket_key.is_empty() { + return Err(IllegalArgument { + message: "bucket_key must not be empty!".to_string(), + }); + } + + if num_buckets <= 0 { + return Err(IllegalArgument { + message: "num_buckets must be positive!".to_string(), + }); + } + + let key_hash = murmur_hash::fluss_hash_bytes(bucket_key)?; + + Ok((key_hash % num_buckets).abs()) + } +} + +struct IcebergBucketingFunction; +impl BucketingFunction for IcebergBucketingFunction { + fn bucketing(&self, bucket_key: &[u8], num_buckets: i32) -> Result { + if bucket_key.is_empty() { + return Err(IllegalArgument { + message: "bucket_key must not be empty!".to_string(), + }); + } + + if num_buckets <= 0 { + return Err(IllegalArgument { + message: "num_buckets must be positive!".to_string(), + }); + }; + + Ok((murmur_hash::hash_bytes(bucket_key) as i32 & i32::MAX) % num_buckets) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_bucketing() { + let default_bucketing = ::of(None); + + let expected = 1; + let actual = default_bucketing.bucketing(&[00u8, 10u8], 7).unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 0; + let actual = default_bucketing + .bucketing(&[00u8, 10u8, 10u8, 10u8], 12) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 6; + let actual = default_bucketing + .bucketing("2bb87d68-baf9-4e64-90f9-f80910419fa6".as_bytes(), 16) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 6; + let actual = default_bucketing + .bucketing("The quick brown fox jumps 
over the lazy dog".as_bytes(), 8) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + } + + #[test] + fn test_paimon_bucketing() { + let paimon_bucketing = ::of(Some(&DataLakeFormat::Paimon)); + + let expected = 1; + let actual = paimon_bucketing.bucketing(&[00u8, 10u8], 7).unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 11; + let actual = paimon_bucketing + .bucketing(&[00u8, 10u8, 10u8, 10u8], 12) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 12; + let actual = paimon_bucketing + .bucketing("2bb87d68-baf9-4e64-90f9-f80910419fa6".as_bytes(), 16) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 0; + let actual = paimon_bucketing + .bucketing("The quick brown fox jumps over the lazy dog".as_bytes(), 8) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + } + + #[test] + fn test_lance_bucketing() { + let lance_bucketing = ::of(Some(&DataLakeFormat::Lance)); + + let expected = 1; + let actual = lance_bucketing.bucketing(&[00u8, 10u8], 7).unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 0; + let actual = lance_bucketing + .bucketing(&[00u8, 10u8, 10u8, 10u8], 12) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 6; + let actual = lance_bucketing + .bucketing("2bb87d68-baf9-4e64-90f9-f80910419fa6".as_bytes(), 16) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 6; + let actual = lance_bucketing + .bucketing("The quick brown fox jumps over the lazy dog".as_bytes(), 8) + .unwrap(); + assert_eq!( + expected, 
actual, + "Expecting bucket to be {expected} but got {actual}" + ); + } + + #[test] + fn test_iceberg_bucketing() { + let iceberg_bucketing = ::of(Some(&DataLakeFormat::Iceberg)); + + let expected = 3; + let actual = iceberg_bucketing.bucketing(&[00u8, 10u8], 7).unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 4; + let actual = iceberg_bucketing + .bucketing(&[00u8, 10u8, 10u8, 10u8], 12) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 12; + let actual = iceberg_bucketing + .bucketing("2bb87d68-baf9-4e64-90f9-f80910419fa6".as_bytes(), 16) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 3; + let actual = iceberg_bucketing + .bucketing("The quick brown fox jumps over the lazy dog".as_bytes(), 8) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs b/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs index 991c5f9197..44b2673697 100644 --- a/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs +++ b/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. 
+use crate::bucketing::BucketingFunction; use crate::cluster::Cluster; +use crate::error::Error::IllegalArgument; +use crate::error::Result; use crate::metadata::TablePath; use rand::Rng; use std::sync::atomic::{AtomicI32, Ordering}; @@ -25,7 +28,7 @@ pub trait BucketAssigner: Sync + Send { fn on_new_batch(&self, cluster: &Cluster, prev_bucket_id: i32); - fn assign_bucket(&self, bucket_key: Option<&[u8]>, cluster: &Cluster) -> i32; + fn assign_bucket(&self, bucket_key: Option<&[u8]>, cluster: &Cluster) -> Result; } #[derive(Debug)] @@ -91,12 +94,55 @@ impl BucketAssigner for StickyBucketAssigner { self.next_bucket(cluster, prev_bucket_id); } - fn assign_bucket(&self, _bucket_key: Option<&[u8]>, cluster: &Cluster) -> i32 { + fn assign_bucket(&self, _bucket_key: Option<&[u8]>, cluster: &Cluster) -> Result { let bucket_id = self.current_bucket_id.load(Ordering::Relaxed); if bucket_id < 0 { - self.next_bucket(cluster, bucket_id) + Ok(self.next_bucket(cluster, bucket_id)) } else { - bucket_id + Ok(bucket_id) } } } + +/// A [BucketAssigner] which assigns based on a modulo hashing function +pub struct HashBucketAssigner { + num_buckets: i32, + bucketing_function: Box, +} + +#[allow(dead_code)] +impl HashBucketAssigner { + /// Creates a new [HashBucketAssigner] based on the given [BucketingFunction]. + /// See [BucketingFunction.of(Option<&DataLakeFormat>)] for bucketing functions. 
+ /// + /// + /// # Arguments + /// * `num_buckets` - The number of buckets + /// * `bucketing_function` - The bucketing function + /// + /// # Returns + /// * [HashBucketAssigner] - The hash bucket assigner + pub fn new(num_buckets: i32, bucketing_function: Box) -> Self { + HashBucketAssigner { + num_buckets, + bucketing_function, + } + } +} + +impl BucketAssigner for HashBucketAssigner { + fn abort_if_batch_full(&self) -> bool { + false + } + + fn on_new_batch(&self, _: &Cluster, _: i32) { + // do nothing + } + + fn assign_bucket(&self, bucket_key: Option<&[u8]>, _: &Cluster) -> Result { + let key = bucket_key.ok_or_else(|| IllegalArgument { + message: "no bucket key provided".to_string(), + })?; + self.bucketing_function.bucketing(key, self.num_buckets) + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/writer_client.rs b/fluss-rust/crates/fluss/src/client/write/writer_client.rs index 042859afb4..22e0397212 100644 --- a/fluss-rust/crates/fluss/src/client/write/writer_client.rs +++ b/fluss-rust/crates/fluss/src/client/write/writer_client.rs @@ -91,7 +91,7 @@ impl WriterClient { let table_path = &record.table_path; let cluster = self.metadata.get_cluster(); - let (bucket_assigner, bucket_id) = self.assign_bucket(table_path); + let (bucket_assigner, bucket_id) = self.assign_bucket(table_path)?; let mut result = self .accumulate @@ -101,7 +101,7 @@ impl WriterClient { if result.abort_record_for_new_batch { let prev_bucket_id = bucket_id; bucket_assigner.on_new_batch(&cluster, prev_bucket_id); - let bucket_id = bucket_assigner.assign_bucket(None, &cluster); + let bucket_id = bucket_assigner.assign_bucket(None, &cluster)?; result = self .accumulate .append(record, bucket_id, &cluster, false) @@ -114,7 +114,10 @@ impl WriterClient { Ok(result.result_handle.expect("result_handle should exist")) } - fn assign_bucket(&self, table_path: &Arc) -> (Arc>, i32) { + fn assign_bucket( + &self, + table_path: &Arc, + ) -> Result<(Arc>, i32)> { let cluster = 
self.metadata.get_cluster(); let bucket_assigner = { if let Some(assigner) = self.bucket_assigners.get(table_path) { @@ -126,8 +129,8 @@ impl WriterClient { assigner } }; - let bucket_id = bucket_assigner.assign_bucket(None, &cluster); - (bucket_assigner, bucket_id) + let bucket_id = bucket_assigner.assign_bucket(None, &cluster)?; + Ok((bucket_assigner, bucket_id)) } pub async fn close(self) -> Result<()> { diff --git a/fluss-rust/crates/fluss/src/lib.rs b/fluss-rust/crates/fluss/src/lib.rs index 25978ce0bb..1bd72a4aac 100644 --- a/fluss-rust/crates/fluss/src/lib.rs +++ b/fluss-rust/crates/fluss/src/lib.rs @@ -26,6 +26,7 @@ mod cluster; pub mod config; pub mod error; +mod bucketing; mod compression; pub mod io; mod util; diff --git a/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs b/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs new file mode 100644 index 0000000000..76a23f8d96 --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// Identifies the logical format of a data lake table supported by Fluss. 
+/// +/// This enum is typically used in metadata and configuration to distinguish +/// between different table formats so that the appropriate integration and +/// semantics can be applied. +pub enum DataLakeFormat { + /// Apache Paimon data lake table format. + Paimon, + /// Lance columnar data format / lakehouse table format. + Lance, + /// Apache Iceberg data lake table format. + Iceberg, +} diff --git a/fluss-rust/crates/fluss/src/metadata/mod.rs b/fluss-rust/crates/fluss/src/metadata/mod.rs index 87540071f3..9c0b1b472c 100644 --- a/fluss-rust/crates/fluss/src/metadata/mod.rs +++ b/fluss-rust/crates/fluss/src/metadata/mod.rs @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. +mod data_lake_format; mod database; mod datatype; mod json_serde; mod table; +pub use data_lake_format::*; pub use database::*; pub use datatype::*; pub use json_serde::*; diff --git a/fluss-rust/crates/fluss/src/util/mod.rs b/fluss-rust/crates/fluss/src/util/mod.rs index d8c0db59d8..5f67290e43 100644 --- a/fluss-rust/crates/fluss/src/util/mod.rs +++ b/fluss-rust/crates/fluss/src/util/mod.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +pub mod murmur_hash; + use crate::metadata::TableBucket; use linked_hash_map::LinkedHashMap; use std::collections::{HashMap, HashSet}; diff --git a/fluss-rust/crates/fluss/src/util/murmur_hash.rs b/fluss-rust/crates/fluss/src/util/murmur_hash.rs new file mode 100644 index 0000000000..12229c717d --- /dev/null +++ b/fluss-rust/crates/fluss/src/util/murmur_hash.rs @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/* This file is based on source code of Apache Flink Project (https://flink.apache.org/), licensed by the Apache + * Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for + * additional information regarding copyright ownership. */ +use crate::error::Error::IllegalArgument; +use crate::error::Result; + +pub const MURMUR3_DEFAULT_SEED: u32 = 0; +pub const FLINK_MURMUR3_DEFAULT_SEED: i32 = 42; + +const C1: u32 = 0xCC9E_2D51; +const C2: u32 = 0x1B87_3593; +const R1: u32 = 15; +const R2: u32 = 13; +const M: u32 = 5; +const N: u32 = 0xE654_6B64; +const CHUNK_SIZE: usize = 4; + +/// Hashes the data using 32-bit Murmur3 hash with 0 as seed +/// +/// # Arguments +/// * `data` - byte array containing data to be hashed +/// +/// # Returns +/// Returns hash value +pub fn hash_bytes(data: &[u8]) -> u32 { + hash_bytes_with_seed(data, MURMUR3_DEFAULT_SEED) +} + +#[inline(always)] +fn hash_bytes_with_seed(data: &[u8], seed: u32) -> u32 { + let length = data.len(); + let chunks = length / CHUNK_SIZE; + let length_aligned = chunks * CHUNK_SIZE; + + let mut h1 = hash_full_chunks(data, seed); + let mut k1 = 0u32; + + for (shift, &b) in data[length_aligned..].iter().enumerate() { + k1 |= (b as u32) << (8 * shift); + } + + h1 ^= k1.wrapping_mul(C1).rotate_left(R1).wrapping_mul(C2); + + fmix(h1, length) +} + +/// Hashes the data using 
Fluss'/Flink's variant of 32-bit Murmur hash with 42 as seed and tail bytes mixed into hash byte-by-byte +/// Maximum data array size supported is 2GB +/// +/// # Arguments +/// * `data` - byte array containing data to be hashed +/// +/// # Returns +/// * result of hashing, `Ok(hash_value)` +/// +/// # Error +/// Returns `Err(IllegalArgument)` if byte array is larger than 2GB +pub fn fluss_hash_bytes(data: &[u8]) -> Result { + fluss_hash_bytes_with_seed(data, FLINK_MURMUR3_DEFAULT_SEED) +} +#[inline(always)] +fn fluss_hash_bytes_with_seed(data: &[u8], seed: i32) -> Result { + let length = data.len(); + + if length >= i32::MAX as usize { + return Err(IllegalArgument { + message: "data array size {length} is bigger than supported".to_string(), + }); + } + + let chunks = length / CHUNK_SIZE; + let length_aligned = chunks * CHUNK_SIZE; + + let mut h1 = hash_full_chunks(data, seed as u32); + + for byte in data.iter().take(length).skip(length_aligned) { + let k1 = mix_k1(*byte as u32); + h1 = mix_h1(h1, k1); + } + + Ok(fmix(h1, length) as i32) +} + +#[inline(always)] +fn hash_full_chunks(data: &[u8], seed: u32) -> u32 { + data.chunks_exact(CHUNK_SIZE).fold(seed, |h1, chunk| { + let block = u32::from_le_bytes(chunk.try_into().unwrap()); + let k1 = mix_k1(block); + mix_h1(h1, k1) + }) +} + +#[inline(always)] +fn mix_k1(k1: u32) -> u32 { + k1.wrapping_mul(C1).rotate_left(R1).wrapping_mul(C2) +} + +#[inline(always)] +fn mix_h1(h1: u32, k1: u32) -> u32 { + (h1 ^ k1).rotate_left(R2).wrapping_mul(M).wrapping_add(N) +} + +// Finalization mix - force all bits of a hash block to avalanche +#[inline(always)] +fn fmix(mut h1: u32, length: usize) -> u32 { + h1 ^= length as u32; + bit_mix(h1) +} + +/// Hashes an i32 using Fluss'/Flink's variant of Murmur +/// +/// # Arguments +/// * `input` - i32 value to be hashed +/// +/// # Returns +/// Returns hash value +pub fn fluss_hash_i32(input: i32) -> i32 { + let mut input = input as u32; + input = input.wrapping_mul(C1); + input = 
input.rotate_left(R1); + input = input.wrapping_mul(C2); + input = input.rotate_left(R2); + + input = input.wrapping_mul(M).wrapping_add(N); + input ^= CHUNK_SIZE as u32; + let output = bit_mix(input) as i32; + + if output >= 0 { + output + } else if output != i32::MIN { + -output + } else { + 0 + } +} + +const BIT_MIX_A: u32 = 0x85EB_CA6B; +const BIT_MIX_B: u32 = 0xC2B2_AE35; + +#[inline(always)] +fn bit_mix(mut input: u32) -> u32 { + input = input ^ (input >> 16); + input = input.wrapping_mul(BIT_MIX_A); + input = input ^ (input >> 13); + input = input.wrapping_mul(BIT_MIX_B); + input = input ^ (input >> 16); + input +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_murmur3() { + // + let empty_data_hash = hash_bytes(&[]); + assert_eq!(empty_data_hash, 0); + + let empty_data_hash = hash_bytes_with_seed(&[], 1); + assert_eq!(0x514E_28B7, empty_data_hash); + + let empty_data_hash = hash_bytes_with_seed(&[], 0xFFFF_FFFF); + assert_eq!(0x81F1_6F39, empty_data_hash); + + let hash = hash_bytes("The quick brown fox jumps over the lazy dog".as_bytes()); + assert_eq!(0x2E4F_F723, hash); + + let hash = hash_bytes_with_seed( + "The quick brown fox jumps over the lazy dog".as_bytes(), + 0x9747_B28C, + ); + assert_eq!(0x2FA8_26CD, hash); + } + + #[test] + fn test_flink_murmur() { + let empty_data_hash = fluss_hash_bytes_with_seed(&[], 0).expect("Failed to hash"); + assert_eq!(empty_data_hash, 0); + + let empty_data_hash = fluss_hash_bytes(&[]).expect("Failed to hash"); + assert_eq!(0x087F_CD5C, empty_data_hash); + + let empty_data_hash = + fluss_hash_bytes_with_seed(&[], 0xFFFF_FFFFu32 as i32).expect("Failed to hash"); + assert_eq!(0x81F1_6F39u32 as i32, empty_data_hash); + + let hash = + fluss_hash_bytes_with_seed("The quick brown fox jumps over the lazy dog".as_bytes(), 0) + .expect("Failed to hash"); + assert_eq!(0x5FD2_0A20, hash); + + let hash = fluss_hash_bytes("The quick brown fox jumps over the lazy dog".as_bytes()) + .expect("Failed to hash"); + 
assert_eq!(0x1BC6_F880, hash); + + let hash = fluss_hash_i32(0); + assert_eq!(0x2362_F9DE, hash); + + let hash = fluss_hash_i32(42); + assert_eq!(0x43A4_6E1D, hash); + + let hash = fluss_hash_i32(-77); + assert_eq!(0x2EEB_27DE, hash); + } +} From eea8da1fd2fca46435f30d4f741cdd7b58c40a61 Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Fri, 2 Jan 2026 08:35:43 +0000 Subject: [PATCH 046/287] chore: rust client to return error when currently unimplemented non-default ZSTD compression is configured (#120) --------- Co-authored-by: luoyuxia --- .../crates/fluss/src/client/table/scanner.rs | 7 +--- .../src/compression/arrow_compression.rs | 41 ++++++++++++++----- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index 11bdfa3148..bf3983917d 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -344,10 +344,7 @@ impl LogFetcher { if let Error::RpcError { source, .. 
} = &e && matches!(source, RpcError::ConnectionError(_) | RpcError::Poisoned(_)) { - warn!( - "Retrying after encountering error while updating table metadata: {}", - e - ); + warn!("Retrying after encountering error while updating table metadata: {e}"); Ok(()) } else { Err(e) @@ -395,7 +392,7 @@ impl LogFetcher { let server_node = match cluster.get_tablet_server(leader) { Some(node) => node, None => { - warn!("No server node found for leader {}, retrying", leader); + warn!("No server node found for leader {leader}, retrying"); Self::handle_fetch_failure(metadata, &leader, &fetch_request).await; return; } diff --git a/fluss-rust/crates/fluss/src/compression/arrow_compression.rs b/fluss-rust/crates/fluss/src/compression/arrow_compression.rs index 32dfadb48c..8121a512b1 100644 --- a/fluss-rust/crates/fluss/src/compression/arrow_compression.rs +++ b/fluss-rust/crates/fluss/src/compression/arrow_compression.rs @@ -17,6 +17,7 @@ use crate::error::{Error, Result}; use arrow::ipc::CompressionType; +use arrow_schema::ArrowError; use std::collections::HashMap; pub const TABLE_LOG_ARROW_COMPRESSION_ZSTD_LEVEL: &str = "table.log.arrow.compression.zstd.level"; @@ -71,21 +72,31 @@ impl ArrowCompressionInfo { { Some(Ok(level)) if !(1..=22).contains(&level) => Err(Error::IllegalArgument { message: format!( - "Invalid ZSTD compression level: {}. Expected a value between 1 and 22.", - level + "Invalid ZSTD compression level: {level}. Expected a value between 1 and 22." ), }), Some(Err(e)) => Err(Error::IllegalArgument { message: format!( - "Invalid ZSTD compression level. Expected a value between 1 and 22. {}", - e + "Invalid ZSTD compression level. Expected a value between 1 and 22. 
{e}" ), }), - - Some(Ok(level)) => Ok(Self { - compression_type, - compression_level: level, - }), + Some(Ok(level)) => { + // TODO Remove once non-default ZSTD compression level is implemented https://github.com/apache/fluss-rust/issues/109 + if level != DEFAULT_ZSTD_COMPRESSION_LEVEL { + return Err(Error::ArrowError { + message: format!( + "Rust client currently only implements default ZSTD compression level {DEFAULT_ZSTD_COMPRESSION_LEVEL}. Got: {level}." + ), + source: ArrowError::NotYetImplemented(format!( + "zstd compression level {level}." + )), + }); + } + Ok(Self { + compression_type, + compression_level: level, + }) + } None => Ok(Self { compression_type, compression_level: DEFAULT_ZSTD_COMPRESSION_LEVEL, @@ -171,11 +182,19 @@ mod tests { "ZSTD", )])); assert_eq!(compression_info.unwrap().compression_level, 3); - let compression_info = ArrowCompressionInfo::from_conf(&mk_map(&[ + } + + // TODO Remove once non-default ZSTD compression level is implemented https://github.com/apache/fluss-rust/issues/109 + #[test] + fn test_from_conf_zstd_compression_level_error_when_non_default() { + let result = ArrowCompressionInfo::from_conf(&mk_map(&[ ("table.log.arrow.compression.type", "ZSTD"), ("table.log.arrow.compression.zstd.level", "1"), ])); - assert_eq!(compression_info.unwrap().compression_level, 1); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains( + "Rust client currently only implements default ZSTD compression level 3. Got: 1." 
+ )); } #[test] From 3d8483ef53338a9c67fa7f157990d3c283d5e7c7 Mon Sep 17 00:00:00 2001 From: Andrea Bozzo Date: Fri, 2 Jan 2026 09:54:09 +0100 Subject: [PATCH 047/287] chore: add integration tests for subscribe_batch and project_by_name (#116) --------- Co-authored-by: luoyuxia --- fluss-rust/bindings/cpp/src/lib.rs | 2 +- .../crates/fluss/src/client/table/scanner.rs | 6 +- .../crates/fluss/tests/integration/table.rs | 243 ++++++++++++------ 3 files changed, 169 insertions(+), 82 deletions(-) diff --git a/fluss-rust/bindings/cpp/src/lib.rs b/fluss-rust/bindings/cpp/src/lib.rs index cd1803b888..2d37763673 100644 --- a/fluss-rust/bindings/cpp/src/lib.rs +++ b/fluss-rust/bindings/cpp/src/lib.rs @@ -625,7 +625,7 @@ impl LogScanner { bucket_offsets.insert(sub.bucket_id, sub.offset); } - let result = RUNTIME.block_on(async { self.inner.subscribe_batch(bucket_offsets).await }); + let result = RUNTIME.block_on(async { self.inner.subscribe_batch(&bucket_offsets).await }); match result { Ok(_) => ok_result(), diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index bf3983917d..0acaac890f 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -223,7 +223,7 @@ impl LogScanner { Ok(()) } - pub async fn subscribe_batch(&self, bucket_offsets: HashMap) -> Result<()> { + pub async fn subscribe_batch(&self, bucket_offsets: &HashMap) -> Result<()> { self.metadata .check_and_update_table_metadata(from_ref(&self.table_path)) .await?; @@ -236,8 +236,8 @@ impl LogScanner { let mut scan_bucket_offsets = HashMap::new(); for (bucket_id, offset) in bucket_offsets { - let table_bucket = TableBucket::new(self.table_id, bucket_id); - scan_bucket_offsets.insert(table_bucket, offset); + let table_bucket = TableBucket::new(self.table_id, *bucket_id); + scan_bucket_offsets.insert(table_bucket, *offset); } self.log_scanner_status diff --git 
a/fluss-rust/crates/fluss/tests/integration/table.rs b/fluss-rust/crates/fluss/tests/integration/table.rs index 006adcc45b..0ac34c7635 100644 --- a/fluss-rust/crates/fluss/tests/integration/table.rs +++ b/fluss-rust/crates/fluss/tests/integration/table.rs @@ -36,12 +36,16 @@ mod table_test { use crate::integration::fluss_cluster::{FlussTestingCluster, FlussTestingClusterBuilder}; use crate::integration::utils::create_table; use arrow::array::record_batch; + use fluss::client::{FlussTable, TableScan}; use fluss::metadata::{DataTypes, Schema, TableBucket, TableDescriptor, TablePath}; + use fluss::record::ScanRecord; use fluss::row::InternalRow; use fluss::rpc::message::OffsetSpec; use jiff::Timestamp; + use std::collections::HashMap; use std::sync::Arc; use std::thread; + use std::time::Duration; fn before_all() { // Create a new tokio runtime in a separate thread @@ -137,6 +141,11 @@ mod table_test { append_writer.flush().await.expect("Failed to flush"); + // Create scanner to verify appended records + let table = connection + .get_table(&table_path) + .await + .expect("Failed to get table"); let num_buckets = table.table_info().get_num_buckets(); let log_scanner = table .new_scan() @@ -149,84 +158,6 @@ mod table_test { .expect("Failed to subscribe"); } - let scan_records = log_scanner - .poll(std::time::Duration::from_secs(60)) - .await - .expect("Failed to poll"); - - let mut records: Vec<_> = scan_records.into_iter().collect(); - records.sort_by_key(|r| r.offset()); - - assert_eq!(records.len(), 6, "Should have 6 records"); - for (i, record) in records.iter().enumerate() { - let row = record.row(); - let expected_c1 = (i + 1) as i32; - let expected_c2 = format!("a{}", i + 1); - assert_eq!(row.get_int(0), expected_c1, "c1 mismatch at index {}", i); - assert_eq!(row.get_string(1), expected_c2, "c2 mismatch at index {}", i); - } - - let log_scanner_projected = table - .new_scan() - .project(&[1, 0]) - .expect("Failed to project") - .create_log_scanner() - 
.expect("Failed to create log scanner"); - for bucket_id in 0..num_buckets { - log_scanner_projected - .subscribe(bucket_id, 0) - .await - .expect("Failed to subscribe"); - } - - let scan_records_projected = log_scanner_projected - .poll(std::time::Duration::from_secs(10)) - .await - .expect("Failed to poll"); - - let mut records_projected: Vec<_> = scan_records_projected.into_iter().collect(); - records_projected.sort_by_key(|r| r.offset()); - - assert_eq!( - records_projected.len(), - 6, - "Should have 6 records with projection" - ); - for (i, record) in records_projected.iter().enumerate() { - let row = record.row(); - let expected_c2 = format!("a{}", i + 1); - let expected_c1 = (i + 1) as i32; - assert_eq!( - row.get_string(0), - expected_c2, - "Projected c2 (first column) mismatch at index {}", - i - ); - assert_eq!( - row.get_int(1), - expected_c1, - "Projected c1 (second column) mismatch at index {}", - i - ); - } - - // Create scanner to verify appended records - let table = connection - .get_table(&table_path) - .await - .expect("Failed to get table"); - - let table_scan = table.new_scan(); - let log_scanner = table_scan - .create_log_scanner() - .expect("Failed to create log scanner"); - - // Subscribe to bucket 0 starting from offset 0 - log_scanner - .subscribe(0, 0) - .await - .expect("Failed to subscribe to bucket"); - // Poll for records let scan_records = log_scanner .poll(tokio::time::Duration::from_secs(10)) @@ -382,4 +313,160 @@ mod table_test { "Timestamp after append should resolve to offset 0 (no newer records)" ); } + + #[tokio::test] + async fn test_project() { + let cluster = get_fluss_cluster(); + let connection = cluster.get_fluss_connection().await; + + let admin = connection.get_admin().await.expect("Failed to get admin"); + + let table_path = TablePath::new("fluss".to_string(), "test_project".to_string()); + + let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("col_a", DataTypes::int()) + 
.column("col_b", DataTypes::string()) + .column("col_c", DataTypes::int()) + .build() + .expect("Failed to build schema"), + ) + .build() + .expect("Failed to build table"); + + create_table(&admin, &table_path, &table_descriptor).await; + + let table = connection + .get_table(&table_path) + .await + .expect("Failed to get table"); + + // Append 3 records + let append_writer = table + .new_append() + .expect("Failed to create append") + .create_writer(); + + let batch = record_batch!( + ("col_a", Int32, [1, 2, 3]), + ("col_b", Utf8, ["x", "y", "z"]), + ("col_c", Int32, [10, 20, 30]) + ) + .unwrap(); + append_writer + .append_arrow_batch(batch) + .await + .expect("Failed to append batch"); + append_writer.flush().await.expect("Failed to flush"); + + // Test project_by_name: select col_b and col_c only + let records = scan_table(&table, |scan| { + scan.project_by_name(&["col_b", "col_c"]) + .expect("Failed to project by name") + }) + .await; + + assert_eq!( + records.len(), + 3, + "Should have 3 records with project_by_name" + ); + + // Verify projected columns are in the correct order (col_b, col_c) + let expected_col_b = ["x", "y", "z"]; + let expected_col_c = [10, 20, 30]; + + for (i, record) in records.iter().enumerate() { + let row = record.row(); + // col_b is now at index 0, col_c is at index 1 + assert_eq!( + row.get_string(0), + expected_col_b[i], + "col_b mismatch at index {}", + i + ); + assert_eq!( + row.get_int(1), + expected_col_c[i], + "col_c mismatch at index {}", + i + ); + } + + // test project by column indices + let records = scan_table(&table, |scan| { + scan.project(&[1, 0]).expect("Failed to project by indices") + }) + .await; + + assert_eq!( + records.len(), + 3, + "Should have 3 records with project_by_name" + ); + // Verify projected columns are in the correct order (col_b, col_a) + let expected_col_b = ["x", "y", "z"]; + let expected_col_a = [1, 2, 3]; + + for (i, record) in records.iter().enumerate() { + let row = record.row(); + // col_b 
is now at index 0, col_c is at index 1 + assert_eq!( + row.get_string(0), + expected_col_b[i], + "col_b mismatch at index {}", + i + ); + assert_eq!( + row.get_int(1), + expected_col_a[i], + "col_c mismatch at index {}", + i + ); + } + + // Test error case: empty column names should fail + let result = table.new_scan().project_by_name(&[]); + assert!( + result.is_err(), + "project_by_name with empty names should fail" + ); + + // Test error case: non-existent column should fail + let result = table.new_scan().project_by_name(&["nonexistent_column"]); + assert!( + result.is_err(), + "project_by_name with non-existent column should fail" + ); + } + + async fn scan_table<'a>( + table: &FlussTable<'a>, + setup_scan: impl FnOnce(TableScan) -> TableScan, + ) -> Vec { + // 1. build log scanner + let log_scanner = setup_scan(table.new_scan()) + .create_log_scanner() + .expect("Failed to create log scanner"); + + // 2. subscribe + let mut bucket_offsets = HashMap::new(); + bucket_offsets.insert(0, 0); + log_scanner + .subscribe_batch(&bucket_offsets) + .await + .expect("Failed to subscribe"); + + // 3. poll records + let scan_records = log_scanner + .poll(Duration::from_secs(10)) + .await + .expect("Failed to poll"); + + // 4. 
collect and sort + let mut records: Vec<_> = scan_records.into_iter().collect(); + records.sort_by_key(|r| r.offset()); + records + } } From 2fc00e2c13fad60b9efeb9fb7c6fa2317bdbde47 Mon Sep 17 00:00:00 2001 From: AlexZhao Date: Fri, 2 Jan 2026 19:37:56 +0800 Subject: [PATCH 048/287] feat: support bazel build for cpp bindings (#107) --- fluss-rust/bindings/cpp/.bazelrc | 37 +++ fluss-rust/bindings/cpp/.gitignore | 9 + fluss-rust/bindings/cpp/BUILD.bazel | 342 +++++++++++++++++++++++++++ fluss-rust/bindings/cpp/MODULE.bazel | 23 ++ fluss-rust/bindings/cpp/ci.sh | 100 ++++++++ 5 files changed, 511 insertions(+) create mode 100644 fluss-rust/bindings/cpp/.bazelrc create mode 100644 fluss-rust/bindings/cpp/BUILD.bazel create mode 100644 fluss-rust/bindings/cpp/MODULE.bazel create mode 100755 fluss-rust/bindings/cpp/ci.sh diff --git a/fluss-rust/bindings/cpp/.bazelrc b/fluss-rust/bindings/cpp/.bazelrc new file mode 100644 index 0000000000..ce7d81f82a --- /dev/null +++ b/fluss-rust/bindings/cpp/.bazelrc @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Bazel configuration for fluss-rust C++ bindings + +# Enable BzlMod +common --enable_bzlmod + +# Debug configuration (matches BUILD.bazel settings) +build:debug --compilation_mode=dbg +build:debug --copt=-g3 +build:debug --copt=-ggdb +build:debug --copt=-O0 +build:debug --copt=-fno-omit-frame-pointer +build:debug --copt=-DDEBUG +build:debug --strip=never +build:debug --linkopt=-g + +# Release configuration +build:release --compilation_mode=opt +build:release --copt=-O2 +build:release --copt=-DNDEBUG +build:release --strip=always diff --git a/fluss-rust/bindings/cpp/.gitignore b/fluss-rust/bindings/cpp/.gitignore index 6836e70c06..43f761c951 100644 --- a/fluss-rust/bindings/cpp/.gitignore +++ b/fluss-rust/bindings/cpp/.gitignore @@ -5,3 +5,12 @@ cmake-build-*/ *.a *.so *.dylib + +# Bazel build outputs +bazel-build/ +bazel-bin +bazel-out +bazel-testlogs +bazel-cpp +bazel-* +MODULE.bazel.lock diff --git a/fluss-rust/bindings/cpp/BUILD.bazel b/fluss-rust/bindings/cpp/BUILD.bazel new file mode 100644 index 0000000000..81d483cb72 --- /dev/null +++ b/fluss-rust/bindings/cpp/BUILD.bazel @@ -0,0 +1,342 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +licenses(["notice"]) + +load("@rules_cc//cc:defs.bzl", "cc_library", "cc_binary") + +config_setting( + name = "debug_mode", + values = {"compilation_mode": "dbg"}, +) + +config_setting( + name = "fastbuild_mode", + values = {"compilation_mode": "fastbuild"}, +) + +config_setting( + name = "release_mode", + values = {"compilation_mode": "opt"}, +) + +genrule( + name = "cargo_build_debug", + srcs = glob([ + "src/**/*.rs", + "Cargo.toml", + ]), + outs = [ + "rust_lib_debug.a", + "rust_bridge_cc_debug.cc", + "rust_bridge_h_debug.h", + "src/lib.rs_debug.h", + "cxxbridge/rust/cxx_debug.h", + ], + cmd = """ + set -e + EXECROOT=$$(pwd) + OUTPUT_LIB=$(location rust_lib_debug.a) + OUTPUT_CC=$(location rust_bridge_cc_debug.cc) + OUTPUT_H=$(location rust_bridge_h_debug.h) + OUTPUT_SRC_H=$(location src/lib.rs_debug.h) + OUTPUT_CXX_H=$(location cxxbridge/rust/cxx_debug.h) + # Resolve real source path from sandbox symlink + SANDBOX_CARGO=$(location Cargo.toml) + REAL_CARGO=$$(readlink -f $$SANDBOX_CARGO 2>/dev/null || python3 -c "import os; print(os.path.realpath('$$SANDBOX_CARGO'))") + CARGO_DIR=$$(dirname $$REAL_CARGO) + # Find Cargo workspace root (fluss-rust directory, 2 levels up from bindings/cpp) + WORKSPACE_ROOT=$$(cd $$CARGO_DIR/../.. && pwd) + if [ ! -f $$WORKSPACE_ROOT/Cargo.toml ]; then + echo "Error: Cannot find workspace root Cargo.toml at $$WORKSPACE_ROOT" >&2 + exit 1 + fi + cd $$WORKSPACE_ROOT + cargo build --manifest-path $$CARGO_DIR/Cargo.toml + CARGO_TARGET_DIR=$$WORKSPACE_ROOT/target + # cxxbridge uses the Cargo package name (with hyphen): fluss-cpp + RUST_BRIDGE_DIR=$$CARGO_TARGET_DIR/cxxbridge/fluss-cpp/src + # Cargo converts hyphens to underscores in library file names: libfluss_cpp.a + RUST_LIB=$$CARGO_TARGET_DIR/debug/libfluss_cpp.a + if [ ! -f $$RUST_LIB ]; then + echo "Error: Rust library not found at $$RUST_LIB" >&2 + exit 1 + fi + if [ ! 
-f $$RUST_BRIDGE_DIR/lib.rs.cc ]; then + echo "Error: cxxbridge CC file not found at $$RUST_BRIDGE_DIR/lib.rs.cc" >&2 + exit 1 + fi + if [ ! -f $$RUST_BRIDGE_DIR/lib.rs.h ]; then + echo "Error: cxxbridge header file not found at $$RUST_BRIDGE_DIR/lib.rs.h" >&2 + exit 1 + fi + cd $$EXECROOT + mkdir -p $$(dirname $$OUTPUT_SRC_H) $$(dirname $$OUTPUT_CXX_H) + cp $$RUST_LIB $$OUTPUT_LIB || (echo "Failed to copy $$RUST_LIB to $$OUTPUT_LIB" >&2; exit 1) + cp $$RUST_BRIDGE_DIR/lib.rs.cc $$OUTPUT_CC || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.cc to $$OUTPUT_CC" >&2; exit 1) + cp $$RUST_BRIDGE_DIR/lib.rs.h $$OUTPUT_H || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.h to $$OUTPUT_H" >&2; exit 1) + cp $$RUST_BRIDGE_DIR/lib.rs.h $$OUTPUT_SRC_H || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.h to $$OUTPUT_SRC_H" >&2; exit 1) + CXX_H_SOURCE=$$CARGO_TARGET_DIR/cxxbridge/rust/cxx.h + if [ ! -f $$CXX_H_SOURCE ] && [ ! -L $$CXX_H_SOURCE ]; then + echo "Error: cxx.h not found at $$CXX_H_SOURCE" >&2 + exit 1 + fi + cp -L $$CXX_H_SOURCE $$OUTPUT_CXX_H || (echo "Failed to copy $$CXX_H_SOURCE to $$OUTPUT_CXX_H" >&2; exit 1) + """, + message = "Building Rust library (debug) with cargo...", + local = 1, +) + +genrule( + name = "cargo_build_release", + srcs = glob([ + "src/**/*.rs", + "Cargo.toml", + ]), + outs = [ + "rust_lib_release.a", + "rust_bridge_cc_release.cc", + "rust_bridge_h_release.h", + "src/lib.rs_release.h", + "cxxbridge/rust/cxx_release.h", + ], + cmd = """ + set -e + EXECROOT=$$(pwd) + OUTPUT_LIB=$(location rust_lib_release.a) + OUTPUT_CC=$(location rust_bridge_cc_release.cc) + OUTPUT_H=$(location rust_bridge_h_release.h) + OUTPUT_SRC_H=$(location src/lib.rs_release.h) + OUTPUT_CXX_H=$(location cxxbridge/rust/cxx_release.h) + # Resolve real source path from sandbox symlink + SANDBOX_CARGO=$(location Cargo.toml) + REAL_CARGO=$$(readlink -f $$SANDBOX_CARGO 2>/dev/null || python3 -c "import os; print(os.path.realpath('$$SANDBOX_CARGO'))") + CARGO_DIR=$$(dirname 
$$REAL_CARGO) + # Find Cargo workspace root (fluss-rust directory, 2 levels up from bindings/cpp) + WORKSPACE_ROOT=$$(cd $$CARGO_DIR/../.. && pwd) + if [ ! -f $$WORKSPACE_ROOT/Cargo.toml ]; then + echo "Error: Cannot find workspace root Cargo.toml at $$WORKSPACE_ROOT" >&2 + exit 1 + fi + cd $$WORKSPACE_ROOT + cargo build --release --manifest-path $$CARGO_DIR/Cargo.toml + CARGO_TARGET_DIR=$$WORKSPACE_ROOT/target + # cxxbridge uses the Cargo package name (with hyphen): fluss-cpp + RUST_BRIDGE_DIR=$$CARGO_TARGET_DIR/cxxbridge/fluss-cpp/src + # Cargo converts hyphens to underscores in library file names: libfluss_cpp.a + RUST_LIB=$$CARGO_TARGET_DIR/release/libfluss_cpp.a + if [ ! -f $$RUST_LIB ]; then + echo "Error: Rust library not found at $$RUST_LIB" >&2 + exit 1 + fi + if [ ! -f $$RUST_BRIDGE_DIR/lib.rs.cc ]; then + echo "Error: cxxbridge CC file not found at $$RUST_BRIDGE_DIR/lib.rs.cc" >&2 + exit 1 + fi + if [ ! -f $$RUST_BRIDGE_DIR/lib.rs.h ]; then + echo "Error: cxxbridge header file not found at $$RUST_BRIDGE_DIR/lib.rs.h" >&2 + exit 1 + fi + cd $$EXECROOT + mkdir -p $$(dirname $$OUTPUT_SRC_H) $$(dirname $$OUTPUT_CXX_H) + cp $$RUST_LIB $$OUTPUT_LIB || (echo "Failed to copy $$RUST_LIB to $$OUTPUT_LIB" >&2; exit 1) + cp $$RUST_BRIDGE_DIR/lib.rs.cc $$OUTPUT_CC || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.cc to $$OUTPUT_CC" >&2; exit 1) + cp $$RUST_BRIDGE_DIR/lib.rs.h $$OUTPUT_H || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.h to $$OUTPUT_H" >&2; exit 1) + cp $$RUST_BRIDGE_DIR/lib.rs.h $$OUTPUT_SRC_H || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.h to $$OUTPUT_SRC_H" >&2; exit 1) + CXX_H_SOURCE=$$CARGO_TARGET_DIR/cxxbridge/rust/cxx.h + if [ ! -f $$CXX_H_SOURCE ] && [ ! 
-L $$CXX_H_SOURCE ]; then + echo "Error: cxx.h not found at $$CXX_H_SOURCE" >&2 + exit 1 + fi + cp -L $$CXX_H_SOURCE $$OUTPUT_CXX_H || (echo "Failed to copy $$CXX_H_SOURCE to $$OUTPUT_CXX_H" >&2; exit 1) + """, + message = "Building Rust library (release) with cargo...", + local = 1, +) + +filegroup( + name = "lib_rs_h_selected", + srcs = select({ + ":debug_mode": [":src/lib.rs_debug.h"], + ":fastbuild_mode": [":src/lib.rs_debug.h"], + ":release_mode": [":src/lib.rs_release.h"], + }), +) + +genrule( + name = "lib_rs_h_unified", + srcs = [":lib_rs_h_selected"], + outs = ["src/lib.rs.h"], + cmd = "cp $(location :lib_rs_h_selected) $(location src/lib.rs.h)", + message = "Unifying lib.rs.h for C++ includes", +) + +filegroup( + name = "rust_bridge_cc_selected", + srcs = select({ + ":debug_mode": [":rust_bridge_cc_debug.cc"], + ":fastbuild_mode": [":rust_bridge_cc_debug.cc"], + ":release_mode": [":rust_bridge_cc_release.cc"], + }), +) + +genrule( + name = "rust_bridge_cc_unified", + srcs = [":rust_bridge_cc_selected"], + outs = ["rust_bridge_cc.cc"], + cmd = "cp $(location :rust_bridge_cc_selected) $(location rust_bridge_cc.cc)", + message = "Unifying rust_bridge_cc.cc for C++ compilation", +) + +filegroup( + name = "rust_bridge_h_selected", + srcs = select({ + ":debug_mode": [":rust_bridge_h_debug.h"], + ":fastbuild_mode": [":rust_bridge_h_debug.h"], + ":release_mode": [":rust_bridge_h_release.h"], + }), +) + +genrule( + name = "rust_bridge_h_unified", + srcs = [":rust_bridge_h_selected"], + outs = ["rust_bridge_h.h"], + cmd = "cp $(location :rust_bridge_h_selected) $(location rust_bridge_h.h)", + message = "Unifying rust_bridge_h.h for C++ includes", +) + +filegroup( + name = "cxx_h_selected", + srcs = select({ + ":debug_mode": [":cxxbridge/rust/cxx_debug.h"], + ":fastbuild_mode": [":cxxbridge/rust/cxx_debug.h"], + ":release_mode": [":cxxbridge/rust/cxx_release.h"], + }), +) + +genrule( + name = "cxx_h_unified", + srcs = [":cxx_h_selected"], + outs = 
["cxxbridge/rust/cxx.h"], + cmd = "mkdir -p $$(dirname $(location cxxbridge/rust/cxx.h)) && cp $(location :cxx_h_selected) $(location cxxbridge/rust/cxx.h)", + message = "Unifying cxx.h for C++ includes", +) + +cc_import( + name = "rust_lib", + static_library = select({ + ":debug_mode": ":rust_lib_debug.a", + ":fastbuild_mode": ":rust_lib_debug.a", + ":release_mode": ":rust_lib_release.a", + }), + alwayslink = True, +) + +cc_library( + name = "fluss_cpp", + srcs = [ + "src/admin.cpp", + "src/connection.cpp", + "src/table.cpp", + ":rust_bridge_cc_unified", + ], + hdrs = [ + "include/fluss.hpp", + ], + textual_hdrs = [ + "src/ffi_converter.hpp", + ":rust_bridge_h_unified", + ":lib_rs_h_unified", + ":cxx_h_unified", + ], + strip_include_prefix = "include", + copts = [ + "-std=c++17", + ] + select({ + ":debug_mode": [ + "-g3", + "-O0", + "-ggdb", + "-fno-omit-frame-pointer", + "-DDEBUG", + ], + ":fastbuild_mode": [ + "-g", + "-O0", + ], + ":release_mode": [ + "-O2", + "-DNDEBUG", + ], + }), + includes = [ + "src", + "cxxbridge", + ], + linkopts = [ + "-ldl", + "-lpthread", + ] + select({ + ":debug_mode": ["-g"], + ":fastbuild_mode": ["-g"], + ":release_mode": [], + }) + select({ + "@platforms//os:macos": [ + "-framework", "CoreFoundation", + "-framework", "Security", + ], + "//conditions:default": [], + }), + deps = [ + ":rust_lib", + ], + visibility = ["//visibility:public"], +) + +cc_binary( + name = "fluss_cpp_example", + srcs = [ + "examples/example.cpp", + ], + deps = [":fluss_cpp"], + copts = [ + "-std=c++17", + ] + select({ + ":debug_mode": [ + "-g3", + "-O0", + "-ggdb", + "-fno-omit-frame-pointer", + "-DDEBUG", + ], + ":fastbuild_mode": [ + "-g", + "-O0", + ], + ":release_mode": [ + "-O2", + "-DNDEBUG", + ], + }), + linkopts = select({ + ":debug_mode": ["-g"], + ":fastbuild_mode": ["-g"], + ":release_mode": [], + }), + visibility = ["//visibility:public"], +) + diff --git a/fluss-rust/bindings/cpp/MODULE.bazel b/fluss-rust/bindings/cpp/MODULE.bazel new file 
mode 100644 index 0000000000..f75d3e6f69 --- /dev/null +++ b/fluss-rust/bindings/cpp/MODULE.bazel @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +module( + name = "fluss_cpp", +) + +bazel_dep(name = "rules_cc", version = "0.0.17") +bazel_dep(name = "platforms", version = "0.0.10") diff --git a/fluss-rust/bindings/cpp/ci.sh b/fluss-rust/bindings/cpp/ci.sh new file mode 100755 index 0000000000..b5eb9676b6 --- /dev/null +++ b/fluss-rust/bindings/cpp/ci.sh @@ -0,0 +1,100 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +set -xe + +DIR="$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd)" + +# Set Bazel output base to bazel-build directory +# This ensures all Bazel outputs are in bazel-build/.bazel-output-base +BAZEL_OUTPUT_BASE="$DIR/bazel-build/.bazel-output-base" + +# Create output base directory if it doesn't exist +mkdir -p "$BAZEL_OUTPUT_BASE" + +# Wrapper function to run bazel with --output_base +bazel() { + command bazel --output_base="$BAZEL_OUTPUT_BASE" "$@" +} + +compile() { + bazel build //:fluss_cpp +} + +build_example() { + bazel build //:fluss_cpp_example +} + +run_example() { + build_example + bazel run //:fluss_cpp_example +} + +clean() { + bazel clean + # Remove bazel-* symlinks (Bazel automatically creates these) + rm -f "$DIR"/bazel-* + # Also remove the bazel-build directory if it exists + if [ -d "$DIR/bazel-build" ]; then + rm -rf "$DIR/bazel-build" + fi + echo "Cleaned all Bazel outputs and symlinks" +} + +show_outputs() { + echo "=== Library outputs ===" + bazel cquery //:fluss_cpp --output=files 2>/dev/null || echo "Run 'bazel build //:fluss_cpp' first" + echo "" + echo "=== Example binary outputs ===" + bazel cquery //:fluss_cpp_example --output=files 2>/dev/null || echo "Run 'bazel build //:fluss_cpp_example' first" + echo "" + echo "=== To run the example ===" + echo " bazel run //:fluss_cpp_example" + echo "" + echo "=== To find outputs manually ===" + echo " bazel info bazel-bin" +} + +case $1 in + compile ) + compile + ;; + example ) + build_example + ;; + run ) + run_example + ;; + outputs ) + show_outputs + ;; + clean ) + clean + ;; + * ) + echo "Usage: $0 {compile|example|run|outputs|clean}" + echo "" + echo "Commands:" + echo " compile - Build the fluss_cpp library" + echo " example - Build the example binary" + echo " run - Build and run the example binary" + echo " outputs - Show the location of build outputs" + echo " clean - Clean all Bazel 
outputs" + exit 1 + ;; +esac From 9f834df90aa76b01849063ff302504e13ebea824 Mon Sep 17 00:00:00 2001 From: Kelvin Wu Date: Sat, 3 Jan 2026 14:45:54 +0800 Subject: [PATCH 049/287] feat: implement CompactedRowWriter (#121) --- .../src/row/compacted/compacted_row_writer.rs | 153 ++++++++++++++++++ .../crates/fluss/src/row/compacted/mod.rs | 1 + fluss-rust/crates/fluss/src/row/mod.rs | 2 + 3 files changed, 156 insertions(+) create mode 100644 fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs create mode 100644 fluss-rust/crates/fluss/src/row/compacted/mod.rs diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs new file mode 100644 index 0000000000..7c0addef34 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use bytes::{Bytes, BytesMut}; +use std::cmp; + +// Writer for CompactedRow +// Reference implementation: +// https://github.com/apache/fluss/blob/d4a72fad240d4b81563aaf83fa3b09b5058674ed/fluss-common/src/main/java/org/apache/fluss/row/compacted/CompactedRowWriter.java#L71 +pub struct CompactedRowWriter { + header_size_in_bytes: usize, + position: usize, + buffer: BytesMut, +} + +impl CompactedRowWriter { + pub const MAX_INT_SIZE: usize = 5; + pub const MAX_LONG_SIZE: usize = 10; + + pub fn new(field_count: usize) -> Self { + // bitset width in bytes, it should be in CompactedRow + let header_size = (field_count + 7) / 8; + let cap = cmp::max(64, header_size); + + let mut buffer = BytesMut::with_capacity(cap); + buffer.resize(cap, 0); + + Self { + header_size_in_bytes: header_size, + position: header_size, + buffer, + } + } + + pub fn reset(&mut self) { + self.position = self.header_size_in_bytes; + self.buffer[..self.header_size_in_bytes].fill(0); + } + + pub fn position(&self) -> usize { + self.position + } + + pub fn buffer(&self) -> &[u8] { + &self.buffer[..self.position] + } + + pub fn to_bytes(&self) -> Bytes { + Bytes::copy_from_slice(&self.buffer[..self.position]) + } + + fn ensure_capacity(&mut self, need_len: usize) { + if (self.buffer.len() - self.position) < need_len { + let new_len = cmp::max(self.buffer.len() * 2, self.buffer.len() + need_len); + self.buffer.resize(new_len, 0); + } + } + + fn write_raw(&mut self, src: &[u8]) { + let end = self.position + src.len(); + self.ensure_capacity(src.len()); + self.buffer[self.position..end].copy_from_slice(src); + self.position = end; + } + + pub fn set_null_at(&mut self, pos: usize) { + let byte_index = pos >> 3; + let bit = pos & 7; + debug_assert!(byte_index < self.header_size_in_bytes); + self.buffer[byte_index] |= 1u8 << bit; + } + + pub fn write_boolean(&mut self, value: bool) { + let b = if value { 1u8 } else { 0u8 }; + self.write_raw(&[b]); + } + + pub fn write_byte(&mut self, value: u8) { + 
self.write_raw(&[value as u8]); + } + + pub fn write_binary(&mut self, bytes: &[u8], length: usize) { + // TODO: currently, we encoding BINARY(length) as the same with BYTES, the length info can + // be omitted and the bytes length should be enforced in the future. + self.write_bytes(&bytes[..length.min(bytes.len())]); + } + + pub fn write_bytes(&mut self, value: &[u8]) { + let len_i32 = + i32::try_from(value.len()).expect("byte slice too large to encode length as i32"); + self.write_int(len_i32); + self.write_raw(value); + } + + pub fn write_char(&mut self, value: &str, length: usize) { + // TODO: currently, we encoding CHAR(length) as the same with STRING, the length info can be + // omitted and the bytes length should be enforced in the future. + self.write_string(value); + } + + pub fn write_string(&mut self, value: &str) { + self.write_bytes(value.as_ref()); + } + + pub fn write_short(&mut self, value: i16) { + self.write_raw(&value.to_ne_bytes()); + } + + pub fn write_int(&mut self, value: i32) { + self.ensure_capacity(Self::MAX_INT_SIZE); + let mut v = value as u32; + while (v & !0x7F) != 0 { + self.buffer[self.position] = ((v as u8) & 0x7F) | 0x80; + self.position += 1; + v >>= 7; + } + self.buffer[self.position] = v as u8; + self.position += 1; + } + pub fn write_long(&mut self, value: i64) { + self.ensure_capacity(Self::MAX_LONG_SIZE); + let mut v = value as u64; + while (v & !0x7F) != 0 { + self.buffer[self.position] = ((v as u8) & 0x7F) | 0x80; + self.position += 1; + v >>= 7; + } + self.buffer[self.position] = v as u8; + self.position += 1; + } + + pub fn write_float(&mut self, value: f32) { + self.write_raw(&value.to_ne_bytes()); + } + + pub fn write_double(&mut self, value: f64) { + self.write_raw(&value.to_ne_bytes()); + } +} diff --git a/fluss-rust/crates/fluss/src/row/compacted/mod.rs b/fluss-rust/crates/fluss/src/row/compacted/mod.rs new file mode 100644 index 0000000000..b9bc66b5cb --- /dev/null +++ 
b/fluss-rust/crates/fluss/src/row/compacted/mod.rs @@ -0,0 +1 @@ +mod compacted_row_writer; diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index 01b89fc9f4..86fdf90c98 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -19,6 +19,8 @@ mod column; mod datum; +mod compacted; + pub use column::*; pub use datum::*; From 83c452563fabe07f53b596ecadd353fed1136ab7 Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Sat, 3 Jan 2026 15:07:55 +0800 Subject: [PATCH 050/287] chore: add license header to fix ci (#123) --- .../src/row/compacted/compacted_row_writer.rs | 8 +++++--- .../crates/fluss/src/row/compacted/mod.rs | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs index 7c0addef34..2debab19ae 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs @@ -21,19 +21,21 @@ use std::cmp; // Writer for CompactedRow // Reference implementation: // https://github.com/apache/fluss/blob/d4a72fad240d4b81563aaf83fa3b09b5058674ed/fluss-common/src/main/java/org/apache/fluss/row/compacted/CompactedRowWriter.java#L71 +#[allow(dead_code)] pub struct CompactedRowWriter { header_size_in_bytes: usize, position: usize, buffer: BytesMut, } +#[allow(dead_code)] impl CompactedRowWriter { pub const MAX_INT_SIZE: usize = 5; pub const MAX_LONG_SIZE: usize = 10; pub fn new(field_count: usize) -> Self { // bitset width in bytes, it should be in CompactedRow - let header_size = (field_count + 7) / 8; + let header_size = field_count.div_ceil(8); let cap = cmp::max(64, header_size); let mut buffer = BytesMut::with_capacity(cap); @@ -90,7 +92,7 @@ impl CompactedRowWriter { } pub fn write_byte(&mut self, value: u8) { - self.write_raw(&[value as u8]); + 
self.write_raw(&[value]); } pub fn write_binary(&mut self, bytes: &[u8], length: usize) { @@ -106,7 +108,7 @@ impl CompactedRowWriter { self.write_raw(value); } - pub fn write_char(&mut self, value: &str, length: usize) { + pub fn write_char(&mut self, value: &str, _length: usize) { // TODO: currently, we encoding CHAR(length) as the same with STRING, the length info can be // omitted and the bytes length should be enforced in the future. self.write_string(value); diff --git a/fluss-rust/crates/fluss/src/row/compacted/mod.rs b/fluss-rust/crates/fluss/src/row/compacted/mod.rs index b9bc66b5cb..695cdad988 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/mod.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/mod.rs @@ -1 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ mod compacted_row_writer; From 526d360f4420151ee99e78d7f5965c58c86c9345 Mon Sep 17 00:00:00 2001 From: Arnav_Panjla <146819303+Arnav-panjla@users.noreply.github.com> Date: Tue, 6 Jan 2026 11:38:02 +0530 Subject: [PATCH 051/287] feat: return error for partitioned tables in FlussConnection#get_table (#115) --- fluss-rust/crates/fluss/src/client/connection.rs | 5 +++++ fluss-rust/crates/fluss/src/error.rs | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/fluss-rust/crates/fluss/src/client/connection.rs b/fluss-rust/crates/fluss/src/client/connection.rs index 899ad597c1..595daf55f5 100644 --- a/fluss-rust/crates/fluss/src/client/connection.rs +++ b/fluss-rust/crates/fluss/src/client/connection.rs @@ -77,6 +77,11 @@ impl FlussConnection { pub async fn get_table(&self, table_path: &TablePath) -> Result> { self.metadata.update_table_metadata(table_path).await?; let table_info = self.metadata.get_cluster().get_table(table_path).clone(); + if table_info.is_partitioned() { + return Err(crate::error::Error::UnsupportedOperation { + message: "Partitioned tables are not supported".to_string(), + }); + } Ok(FlussTable::new(self, self.metadata.clone(), table_info)) } } diff --git a/fluss-rust/crates/fluss/src/error.rs b/fluss-rust/crates/fluss/src/error.rs index 0f4b1b6d11..e04fde14d7 100644 --- a/fluss-rust/crates/fluss/src/error.rs +++ b/fluss-rust/crates/fluss/src/error.rs @@ -98,6 +98,12 @@ pub enum Error { )] IoUnsupported { message: String }, + #[snafu( + visibility(pub(crate)), + display("Fluss hitting unsupported operation error {}.", message) + )] + UnsupportedOperation { message: String }, + #[snafu( visibility(pub(crate)), display("Fluss hitting leader not available error {}.", message) From 49c3043329c74eec947dc6602688b28b30557517 Mon Sep 17 00:00:00 2001 From: AlexZhao Date: Wed, 7 Jan 2026 13:50:27 +0800 Subject: [PATCH 052/287] chore: fix append hang issue (#126) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
--------- Co-authored-by: 赵海源 --- .../fluss/src/client/write/accumulator.rs | 50 ++++++++++--------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs index 215adbe695..beae0caacc 100644 --- a/fluss-rust/crates/fluss/src/client/write/accumulator.rs +++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs @@ -299,34 +299,36 @@ impl RecordAccumulator { .batches .get(&table_bucket.bucket_id()) { - let mut batch = { + let mut maybe_batch = None; + { let mut batch_lock = deque.lock().await; - if batch_lock.is_empty() { - continue; + if !batch_lock.is_empty() { + let first_batch = batch_lock.front().unwrap(); + + if size + first_batch.estimated_size_in_bytes() > max_size as i64 + && !ready.is_empty() + { + // there is a rare case that a single batch size is larger than the request size + // due to compression; in this case we will still eventually send this batch in + // a single request. + break; + } + + maybe_batch = Some(batch_lock.pop_front().unwrap()); } - let first_batch = batch_lock.front().unwrap(); - - if size + first_batch.estimated_size_in_bytes() > max_size as i64 - && !ready.is_empty() - { - // there is a rare case that a single batch size is larger than the request size - // due to compression; in this case we will still eventually send this batch in - // a single request. - break; - } - - batch_lock.pop_front().unwrap() - }; + } - let current_batch_size = batch.estimated_size_in_bytes(); - size += current_batch_size; + if let Some(mut batch) = maybe_batch { + let current_batch_size = batch.estimated_size_in_bytes(); + size += current_batch_size; - // mark the batch as drained. - batch.drained(current_time_ms()); - ready.push(Arc::new(ReadyWriteBatch { - table_bucket, - write_batch: batch, - })); + // mark the batch as drained. 
+ batch.drained(current_time_ms()); + ready.push(Arc::new(ReadyWriteBatch { + table_bucket, + write_batch: batch, + })); + } } } if current_index == start { From 6459f2cdbda3079ed75db1e3f9094655a43c84fb Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Fri, 9 Jan 2026 12:16:17 +0000 Subject: [PATCH 053/287] chore: Fix scanner's example documentation test to allow `cargo test --release` to complete (#129) --- .../crates/fluss/src/client/table/scanner.rs | 86 ++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index 0acaac890f..4255bb6841 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -73,7 +73,49 @@ impl<'a> TableScan<'a> { /// /// # Example /// ``` - /// let scanner = table.new_scan().project(&[0, 2, 3])?.create_log_scanner(); + /// # use fluss::client::FlussConnection; + /// # use fluss::config::Config; + /// # use fluss::error::Result; + /// # use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; + /// # use fluss::row::InternalRow; + /// # use std::time::Duration; + /// + /// # pub async fn example() -> Result<()> { + /// let mut config = Config::default(); + /// config.bootstrap_server = Some("127.0.0.1:9123".to_string()); + /// let conn = FlussConnection::new(config).await?; + /// + /// let table_descriptor = TableDescriptor::builder() + /// .schema( + /// Schema::builder() + /// .column("col1", DataTypes::int()) + /// .column("col2", DataTypes::string()) + /// .column("col3", DataTypes::string()) + /// .column("col3", DataTypes::string()) + /// .build()?, + /// ).build()?; + /// let table_path = TablePath::new("fluss".to_owned(), "rust_test_long".to_owned()); + /// let admin = conn.get_admin().await?; + /// admin.create_table(&table_path, &table_descriptor, true) + /// .await?; + /// let table_info = 
admin.get_table(&table_path).await?; + /// let table = conn.get_table(&table_path).await?; + /// + /// // Project columns by indices + /// let scanner = table.new_scan().project(&[0, 2, 3])?.create_log_scanner()?; + /// let scan_records = scanner.poll(Duration::from_secs(10)).await?; + /// for record in scan_records { + /// let row = record.row(); + /// println!( + /// "{{{}, {}, {}}}@{}", + /// row.get_int(0), + /// row.get_string(2), + /// row.get_string(3), + /// record.offset() + /// ); + /// } + /// # Ok(()) + /// # } /// ``` pub fn project(mut self, column_indices: &[usize]) -> Result { if column_indices.is_empty() { @@ -107,7 +149,47 @@ impl<'a> TableScan<'a> { /// /// # Example /// ``` - /// let scanner = table.new_scan().project_by_name(&["col1", "col3"])?.create_log_scanner(); + /// # use fluss::client::FlussConnection; + /// # use fluss::config::Config; + /// # use fluss::error::Result; + /// # use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; + /// # use fluss::row::InternalRow; + /// # use std::time::Duration; + /// + /// # pub async fn example() -> Result<()> { + /// let mut config = Config::default(); + /// config.bootstrap_server = Some("127.0.0.1:9123".to_string()); + /// let conn = FlussConnection::new(config).await?; + /// + /// let table_descriptor = TableDescriptor::builder() + /// .schema( + /// Schema::builder() + /// .column("col1", DataTypes::int()) + /// .column("col2", DataTypes::string()) + /// .column("col3", DataTypes::string()) + /// .build()?, + /// ).build()?; + /// let table_path = TablePath::new("fluss".to_owned(), "rust_test_long".to_owned()); + /// let admin = conn.get_admin().await?; + /// admin.create_table(&table_path, &table_descriptor, true) + /// .await?; + /// let table_info = admin.get_table(&table_path).await?; + /// let table = conn.get_table(&table_path).await?; + /// + /// // Project columns by column names + /// let scanner = table.new_scan().project_by_name(&["col1", 
"col3"])?.create_log_scanner()?; + /// let scan_records = scanner.poll(Duration::from_secs(10)).await?; + /// for record in scan_records { + /// let row = record.row(); + /// println!( + /// "{{{}, {}}}@{}", + /// row.get_int(0), + /// row.get_string(1), + /// record.offset() + /// ); + /// } + /// # Ok(()) + /// # } /// ``` pub fn project_by_name(mut self, column_names: &[&str]) -> Result { if column_names.is_empty() { From 305a448736a6121a50b7855893de2d5c89f1e873 Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Fri, 9 Jan 2026 12:56:59 +0000 Subject: [PATCH 054/287] feat: introduce CompactedKeyEncoder (#124) --- fluss-rust/crates/fluss/Cargo.toml | 1 + .../crates/fluss/src/metadata/datatype.rs | 30 ++ .../fluss/src/row/binary/binary_writer.rs | 210 +++++++++++ fluss-rust/crates/fluss/src/row/binary/mod.rs | 28 ++ .../src/row/compacted/compacted_key_writer.rs | 97 ++++++ .../crates/fluss/src/row/compacted/mod.rs | 3 + fluss-rust/crates/fluss/src/row/datum.rs | 10 + .../src/row/encode/compacted_key_encoder.rs | 329 ++++++++++++++++++ fluss-rust/crates/fluss/src/row/encode/mod.rs | 64 ++++ .../crates/fluss/src/row/field_getter.rs | 116 ++++++ fluss-rust/crates/fluss/src/row/mod.rs | 9 + 11 files changed, 897 insertions(+) create mode 100644 fluss-rust/crates/fluss/src/row/binary/binary_writer.rs create mode 100644 fluss-rust/crates/fluss/src/row/binary/mod.rs create mode 100644 fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs create mode 100644 fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs create mode 100644 fluss-rust/crates/fluss/src/row/encode/mod.rs create mode 100644 fluss-rust/crates/fluss/src/row/field_getter.rs diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index 27604eecd8..e8c851f7b7 100644 --- a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -59,6 +59,7 @@ uuid = { version = "1.10", features = ["v4"] } tempfile = "3.23.0" snafu = "0.8.3" scopeguard = 
"1.2.0" +delegate = "0.13.5" [target.'cfg(target_arch = "wasm32")'.dependencies] jiff = { workspace = true, features = ["js"] } diff --git a/fluss-rust/crates/fluss/src/metadata/datatype.rs b/fluss-rust/crates/fluss/src/metadata/datatype.rs index 8ad4f7e569..e5ccb9a8e9 100644 --- a/fluss-rust/crates/fluss/src/metadata/datatype.rs +++ b/fluss-rust/crates/fluss/src/metadata/datatype.rs @@ -852,6 +852,36 @@ impl RowType { pub fn fields(&self) -> &Vec { &self.fields } + + pub fn get_field_index(&self, field_name: &str) -> Option { + self.fields.iter().position(|f| f.name == field_name) + } + + #[cfg(test)] + pub fn with_data_types(data_types: Vec) -> Self { + let mut fields: Vec = Vec::new(); + data_types.iter().enumerate().for_each(|(idx, data_type)| { + fields.push(DataField::new(format!("f{}", idx), data_type.clone(), None)); + }); + + Self::with_nullable(true, fields) + } + + #[cfg(test)] + pub fn with_data_types_and_field_names( + data_types: Vec, + field_names: Vec<&str>, + ) -> Self { + let fields = data_types + .into_iter() + .zip(field_names) + .map(|(data_type, field_name)| { + DataField::new(field_name.to_string(), data_type.clone(), None) + }) + .collect::>(); + + Self::with_nullable(true, fields) + } } impl Display for RowType { diff --git a/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs b/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs new file mode 100644 index 0000000000..a296777a30 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs @@ -0,0 +1,210 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::DataType; +use crate::row::Datum; +use crate::row::binary::BinaryRowFormat; + +/// Writer to write a composite data format, like row, array, +#[allow(dead_code)] +pub trait BinaryWriter { + /// Reset writer to prepare next write + fn reset(&mut self); + + /// Set null to this field + fn set_null_at(&mut self, pos: usize); + + fn write_boolean(&mut self, value: bool); + + fn write_byte(&mut self, value: u8); + + fn write_bytes(&mut self, value: &[u8]); + + fn write_char(&mut self, value: &str, length: usize); + + fn write_string(&mut self, value: &str); + + fn write_short(&mut self, value: i16); + + fn write_int(&mut self, value: i32); + + fn write_long(&mut self, value: i64); + + fn write_float(&mut self, value: f32); + + fn write_double(&mut self, value: f64); + + fn write_binary(&mut self, bytes: &[u8], length: usize); + + // TODO Decimal type + // fn write_decimal(&mut self, pos: i32, value: f64); + + // TODO Timestamp type + // fn write_timestamp_ntz(&mut self, pos: i32, value: i64); + + // TODO Timestamp type + // fn write_timestamp_ltz(&mut self, pos: i32, value: i64); + + // TODO InternalArray, ArraySerializer + // fn write_array(&mut self, pos: i32, value: i64); + + // TODO Row serializer + // fn write_row(&mut self, pos: i32, value: &InternalRow); + + /// Finally, complete write to set real size to binary. 
+ fn complete(&mut self); +} + +pub enum ValueWriter { + Nullable(InnerValueWriter), + NonNullable(InnerValueWriter), +} + +impl ValueWriter { + pub fn create_value_writer( + element_type: &DataType, + binary_row_format: Option<&BinaryRowFormat>, + ) -> Result { + let value_writer = + InnerValueWriter::create_inner_value_writer(element_type, binary_row_format)?; + if element_type.is_nullable() { + Ok(Self::Nullable(value_writer)) + } else { + Ok(Self::NonNullable(value_writer)) + } + } + + pub fn write_value( + &self, + writer: &mut W, + pos: usize, + value: &Datum, + ) -> Result<()> { + match self { + Self::Nullable(inner_value_writer) => { + if let Datum::Null = value { + writer.set_null_at(pos); + Ok(()) + } else { + inner_value_writer.write_value(writer, pos, value) + } + } + Self::NonNullable(inner_value_writer) => { + inner_value_writer.write_value(writer, pos, value) + } + } + } +} + +#[derive(Debug)] +pub enum InnerValueWriter { + Char, + String, + Boolean, + Binary, + Bytes, + TinyInt, + SmallInt, + Int, + BigInt, + Float, + Double, + // TODO Decimal, Date, TimeWithoutTimeZone, TimestampWithoutTimeZone, TimestampWithLocalTimeZone, Array, Row +} + +/// Accessor for writing the fields/elements of a binary writer during runtime, the +/// fields/elements must be written in the order. 
+impl InnerValueWriter { + pub fn create_inner_value_writer( + data_type: &DataType, + _: Option<&BinaryRowFormat>, + ) -> Result { + match data_type { + DataType::Char(_) => Ok(InnerValueWriter::Char), + DataType::String(_) => Ok(InnerValueWriter::String), + DataType::Boolean(_) => Ok(InnerValueWriter::Boolean), + DataType::Binary(_) => Ok(InnerValueWriter::Binary), + DataType::Bytes(_) => Ok(InnerValueWriter::Bytes), + DataType::TinyInt(_) => Ok(InnerValueWriter::TinyInt), + DataType::SmallInt(_) => Ok(InnerValueWriter::SmallInt), + DataType::Int(_) => Ok(InnerValueWriter::Int), + DataType::BigInt(_) => Ok(InnerValueWriter::BigInt), + DataType::Float(_) => Ok(InnerValueWriter::Float), + DataType::Double(_) => Ok(InnerValueWriter::Double), + _ => unimplemented!( + "ValueWriter for DataType {:?} is currently not implemented", + data_type + ), + } + } + pub fn write_value( + &self, + writer: &mut W, + _pos: usize, + value: &Datum, + ) -> Result<()> { + match (self, value) { + (InnerValueWriter::Char, Datum::String(v)) => { + writer.write_char(v, v.len()); + } + (InnerValueWriter::String, Datum::String(v)) => { + writer.write_string(v); + } + (InnerValueWriter::Boolean, Datum::Bool(v)) => { + writer.write_boolean(*v); + } + (InnerValueWriter::Binary, Datum::Blob(v)) => { + writer.write_binary(v.as_ref(), v.len()); + } + (InnerValueWriter::Binary, Datum::BorrowedBlob(v)) => { + writer.write_binary(v.as_ref(), v.len()); + } + (InnerValueWriter::Bytes, Datum::Blob(v)) => { + writer.write_bytes(v.as_ref()); + } + (InnerValueWriter::Bytes, Datum::BorrowedBlob(v)) => { + writer.write_bytes(v.as_ref()); + } + (InnerValueWriter::TinyInt, Datum::Int8(v)) => { + writer.write_byte(*v as u8); + } + (InnerValueWriter::SmallInt, Datum::Int16(v)) => { + writer.write_short(*v); + } + (InnerValueWriter::Int, Datum::Int32(v)) => { + writer.write_int(*v); + } + (InnerValueWriter::BigInt, Datum::Int64(v)) => { + writer.write_long(*v); + } + (InnerValueWriter::Float, Datum::Float32(v)) 
=> { + writer.write_float(v.into_inner()); + } + (InnerValueWriter::Double, Datum::Float64(v)) => { + writer.write_double(v.into_inner()); + } + _ => { + return Err(IllegalArgument { + message: format!("{:?} used to write value {:?}", self, value), + }); + } + } + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/row/binary/mod.rs b/fluss-rust/crates/fluss/src/row/binary/mod.rs new file mode 100644 index 0000000000..c31cbd59c1 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/binary/mod.rs @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod binary_writer; + +pub use binary_writer::*; + +/// The binary row format types, it indicates the generated [`BinaryRow`] type by the [`BinaryWriter`] +#[allow(dead_code)] +pub enum BinaryRowFormat { + Compacted, + Aligned, + Indexed, +} diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs new file mode 100644 index 0000000000..84a6b22724 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::row::compacted::compacted_row_writer::CompactedRowWriter; +use bytes::Bytes; + +use crate::error::Result; +use crate::metadata::DataType; +use crate::row::binary::{BinaryRowFormat, BinaryWriter, ValueWriter}; +use delegate::delegate; + +/// A wrapping of [`CompactedRowWriter`] used to encode key columns. +/// The encoding is the same as [`CompactedRowWriter`], but is without header of null bits to +/// represent whether the field value is null or not since the key columns must be not null. +pub struct CompactedKeyWriter { + delegate: CompactedRowWriter, +} + +impl CompactedKeyWriter { + pub fn new() -> CompactedKeyWriter { + CompactedKeyWriter { + // in compacted key encoder, we don't need to set null bits as the key columns must be not + // null, to use field count 0 to init to make the null bits 0 + delegate: CompactedRowWriter::new(0), + } + } + + pub fn create_value_writer(field_type: &DataType) -> Result { + ValueWriter::create_value_writer(field_type, Some(&BinaryRowFormat::Compacted)) + } + + delegate! 
{ + to self.delegate { + pub fn reset(&mut self); + + #[allow(dead_code)] + pub fn position(&self) -> usize; + + #[allow(dead_code)] + pub fn buffer(&self) -> &[u8]; + + pub fn to_bytes(&self) -> Bytes; + } + } +} + +impl BinaryWriter for CompactedKeyWriter { + delegate! { + to self.delegate { + fn reset(&mut self); + + fn set_null_at(&mut self, pos: usize); + + fn write_boolean(&mut self, value: bool); + + fn write_byte(&mut self, value: u8); + + fn write_binary(&mut self, bytes: &[u8], length: usize); + + fn write_bytes(&mut self, value: &[u8]); + + fn write_char(&mut self, value: &str, _length: usize); + + fn write_string(&mut self, value: &str); + + fn write_short(&mut self, value: i16); + + fn write_int(&mut self, value: i32); + + fn write_long(&mut self, value: i64); + + fn write_float(&mut self, value: f32); + + fn write_double(&mut self, value: f64); + + + } + } + + fn complete(&mut self) { + // do nothing + } +} diff --git a/fluss-rust/crates/fluss/src/row/compacted/mod.rs b/fluss-rust/crates/fluss/src/row/compacted/mod.rs index 695cdad988..c81eb5a50b 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/mod.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/mod.rs @@ -15,4 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+mod compacted_key_writer; mod compacted_row_writer; + +pub use compacted_key_writer::CompactedKeyWriter; diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs index 1ea393349e..28a378fd56 100644 --- a/fluss-rust/crates/fluss/src/row/datum.rs +++ b/fluss-rust/crates/fluss/src/row/datum.rs @@ -55,6 +55,8 @@ pub enum Datum<'a> { String(&'a str), #[display("{0}")] Blob(Blob), + #[display("{:?}")] + BorrowedBlob(&'a [u8]), #[display("{0}")] Decimal(Decimal), #[display("{0}")] @@ -80,6 +82,7 @@ impl Datum<'_> { pub fn as_blob(&self) -> &[u8] { match self { Self::Blob(blob) => blob.as_ref(), + Self::BorrowedBlob(blob) => blob, _ => panic!("not a blob: {self:?}"), } } @@ -289,6 +292,7 @@ impl Datum<'_> { Datum::Float64(v) => append_value_to_arrow!(Float64Builder, v.into_inner()), Datum::String(v) => append_value_to_arrow!(StringBuilder, *v), Datum::Blob(v) => append_value_to_arrow!(BinaryBuilder, v.as_ref()), + Datum::BorrowedBlob(v) => append_value_to_arrow!(BinaryBuilder, *v), Datum::Decimal(_) | Datum::Date(_) | Datum::Timestamp(_) | Datum::TimestampTz(_) => { return Err(RowConvertError { message: format!( @@ -406,6 +410,12 @@ impl From> for Blob { } } +impl<'a> From<&'a [u8]> for Datum<'a> { + fn from(bytes: &'a [u8]) -> Datum<'a> { + Datum::BorrowedBlob(bytes) + } +} + const UNIX_EPOCH_DAY: jiff::civil::Date = jiff::civil::date(1970, 1, 1); impl Date { diff --git a/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs b/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs new file mode 100644 index 0000000000..b9335a3c13 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs @@ -0,0 +1,329 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::RowType; +use crate::row::binary::ValueWriter; +use crate::row::compacted::CompactedKeyWriter; +use crate::row::encode::KeyEncoder; +use crate::row::field_getter::FieldGetter; +use crate::row::{Datum, InternalRow}; +use bytes::Bytes; + +#[allow(dead_code)] +pub struct CompactedKeyEncoder { + field_getters: Vec, + field_encoders: Vec, + compacted_encoder: CompactedKeyWriter, +} + +impl CompactedKeyEncoder { + /// Create a key encoder to encode the key of the input row. 
+ /// + /// # Arguments + /// * `row_type` - the row type of the input row + /// * `keys` - the key fields to encode + /// + /// # Returns + /// * key_encoder - the [`KeyEncoder`] + pub fn create_key_encoder(row_type: &RowType, keys: &[String]) -> Result { + let mut encode_col_indexes = Vec::with_capacity(keys.len()); + + for key in keys { + match row_type.get_field_index(key) { + Some(idx) => encode_col_indexes.push(idx), + None => { + return Err(IllegalArgument { + message: format!( + "Field {:?} not found in input row type {:?}", + key, row_type + ), + }); + } + } + } + + Self::new(row_type, encode_col_indexes) + } + + pub fn new(row_type: &RowType, encode_field_pos: Vec) -> Result { + let mut field_getters: Vec = Vec::with_capacity(encode_field_pos.len()); + let mut field_encoders: Vec = Vec::with_capacity(encode_field_pos.len()); + + for pos in &encode_field_pos { + let data_type = row_type.fields().get(*pos).unwrap().data_type(); + field_getters.push(FieldGetter::create(data_type, *pos)); + field_encoders.push(CompactedKeyWriter::create_value_writer(data_type)?); + } + + Ok(CompactedKeyEncoder { + field_encoders, + field_getters, + compacted_encoder: CompactedKeyWriter::new(), + }) + } +} + +#[allow(dead_code)] +impl KeyEncoder for CompactedKeyEncoder { + fn encode_key(&mut self, row: &dyn InternalRow) -> Result { + self.compacted_encoder.reset(); + + // iterate all the fields of the row, and encode each field + for (pos, field_getter) in self.field_getters.iter().enumerate() { + match &field_getter.get_field(row) { + Datum::Null => { + return Err(IllegalArgument { + message: format!( + "Cannot encode key with null value at position: {:?}", + pos + ), + }); + } + value => self.field_encoders.get(pos).unwrap().write_value( + &mut self.compacted_encoder, + pos, + value, + )?, + } + } + + Ok(self.compacted_encoder.to_bytes()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::DataTypes; + use crate::row::{Datum, GenericRow}; + + pub fn 
for_test_row_type(row_type: &RowType) -> CompactedKeyEncoder { + CompactedKeyEncoder::new(row_type, (0..row_type.fields().len()).collect()) + .expect("CompactedKeyEncoder initialization failed") + } + + #[test] + fn test_encode_key() { + let row_type = RowType::with_data_types(vec![ + DataTypes::int(), + DataTypes::bigint(), + DataTypes::int(), + ]); + let row = GenericRow::from_data(vec![ + Datum::from(1i32), + Datum::from(3i64), + Datum::from(2i32), + ]); + + let mut encoder = for_test_row_type(&row_type); + + assert_eq!( + encoder.encode_key(&row).unwrap().iter().as_slice(), + [1u8, 3u8, 2u8] + ); + + let row = GenericRow::from_data(vec![ + Datum::from(2i32), + Datum::from(5i64), + Datum::from(6i32), + ]); + + assert_eq!( + encoder.encode_key(&row).unwrap().iter().as_slice(), + [2u8, 5u8, 6u8] + ); + } + + #[test] + fn test_encode_key_with_key_names() { + let data_types = vec![ + DataTypes::string(), + DataTypes::bigint(), + DataTypes::string(), + ]; + let field_names = vec!["partition", "f1", "f2"]; + + let row_type = RowType::with_data_types_and_field_names(data_types, field_names); + + let primary_keys = &["f2".to_string()]; + + let mut encoder = CompactedKeyEncoder::create_key_encoder(&row_type, primary_keys).unwrap(); + + let row = GenericRow::from_data(vec![ + Datum::from("p1"), + Datum::from(1i64), + Datum::from("a2"), + ]); + + // should only get "a2" 's ASCII representation + assert_eq!( + encoder.encode_key(&row).unwrap().iter().as_slice(), + // 2 (start of text), 97 (the letter a), 50 (the number 2) + [2u8, 97u8, 50u8] + ); + } + + #[test] + #[should_panic(expected = "Cannot encode key with null value at position: 2")] + fn test_null_primary_key() { + let row_type = RowType::with_data_types(vec![ + DataTypes::int(), + DataTypes::bigint(), + DataTypes::int(), + DataTypes::string(), + ]); + + let primary_key_indices = vec![0, 1, 2]; + + let mut encoder = CompactedKeyEncoder::new(&row_type, primary_key_indices) + .expect("CompactedKeyEncoder 
initialization failed"); + + let row = GenericRow::from_data(vec![ + Datum::from(1i32), + Datum::from(3i64), + Datum::from(2i32), + Datum::from("a2"), + ]); + + assert_eq!( + encoder.encode_key(&row).unwrap().iter().as_slice(), + [1u8, 3u8, 2u8] + ); + + let row = GenericRow::from_data(vec![ + Datum::from(1i32), + Datum::from(3i64), + Datum::Null, + Datum::from("a2"), + ]); + + encoder.encode_key(&row).unwrap(); + } + + #[test] + fn test_int_string_as_primary_key() { + let row_type = RowType::with_data_types(vec![ + DataTypes::string(), + DataTypes::int(), + DataTypes::string(), + DataTypes::string(), + ]); + + let primary_key_indices = vec![1, 2]; + let mut encoder = CompactedKeyEncoder::new(&row_type, primary_key_indices) + .expect("CompactedKeyEncoder initialization failed"); + + let row = GenericRow::from_data(vec![ + Datum::from("a1"), + Datum::from(1i32), + Datum::from("a2"), + Datum::from("a3"), + ]); + + assert_eq!( + encoder.encode_key(&row).unwrap().iter().as_slice(), + // 1 (1i32), 2 (start of text), 97 (the letter a), 50 (the number 2) + [1u8, 2u8, 97u8, 50u8] + ); + } + + #[test] + fn test_all_data_types() { + let row_type = RowType::with_data_types(vec![ + DataTypes::boolean(), + DataTypes::tinyint(), + DataTypes::smallint(), + DataTypes::int(), + DataTypes::bigint(), + DataTypes::float(), + DataTypes::double(), + // TODO Date + // TODO Time + DataTypes::binary(20), + DataTypes::bytes(), + DataTypes::char(2), + DataTypes::string(), + // TODO Decimal + // TODO Timestamp + // TODO Timestamp LTZ + // TODO Array of Int + // TODO Array of Float + // TODO Array of String + // TODO: Add Map and Row fields in Issue #1973 + ]); + + let row = GenericRow::from_data(vec![ + Datum::from(true), + Datum::from(2i8), + Datum::from(10i16), + Datum::from(100i32), + Datum::from(-6101065172474983726i64), // from Java test case: new BigInteger("12345678901234567890").longValue() + Datum::from(13.2f32), + Datum::from(15.21f64), + // TODO Date + // TODO Time + 
Datum::from("1234567890".as_bytes()), + Datum::from("20".as_bytes()), + Datum::from("1"), + Datum::from("hello"), + // TODO Decimal + // TODO Timestamp + // TODO Timestamp LTZ + // TODO Array of Int + // TODO Array of Float + // TODO Array of String + // TODO: Add Map and Row fields in Issue #1973 + ]); + + let mut encoder = for_test_row_type(&row_type); + + let mut expected: Vec = Vec::new(); + // BOOLEAN: true + expected.extend(vec![0x01]); + // TINYINT: 2 + expected.extend(vec![0x02]); + // SMALLINT: 10 + expected.extend(vec![0x0A]); + // INT: 100 + expected.extend(vec![0x00, 0x64]); + // BIGINT: -6101065172474983726 + expected.extend(vec![ + 0xD2, 0x95, 0xFC, 0xD8, 0xCE, 0xB1, 0xAA, 0xAA, 0xAB, 0x01, + ]); + // FLOAT: 13.2 + expected.extend(vec![0x33, 0x33, 0x53, 0x41]); + // DOUBLE: 15.21 + expected.extend(vec![0xEC, 0x51, 0xB8, 0x1E, 0x85, 0x6B, 0x2E, 0x40]); + // BINARY(20): "1234567890".getBytes() + expected.extend(vec![ + 0x0A, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, + ]); + + // BYTES: "20".getBytes() + expected.extend(vec![0x02, 0x32, 0x30]); + // CHAR(2): "1" + expected.extend(vec![0x01, 0x31]); + // STRING: String: "hello" + expected.extend(vec![0x05, 0x68, 0x65, 0x6C, 0x6C, 0x6F]); + assert_eq!( + encoder.encode_key(&row).unwrap().iter().as_slice(), + expected.as_slice() + ); + } +} diff --git a/fluss-rust/crates/fluss/src/row/encode/mod.rs b/fluss-rust/crates/fluss/src/row/encode/mod.rs new file mode 100644 index 0000000000..6c6eed9936 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/encode/mod.rs @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod compacted_key_encoder; + +use crate::error::Result; +use crate::metadata::{DataLakeFormat, RowType}; +use crate::row::InternalRow; +use crate::row::encode::compacted_key_encoder::CompactedKeyEncoder; +use bytes::Bytes; + +/// An interface for encoding key of row into bytes. +#[allow(dead_code)] +pub trait KeyEncoder { + fn encode_key(&mut self, row: &dyn InternalRow) -> Result; +} + +#[allow(dead_code)] +impl dyn KeyEncoder { + /// Create a key encoder to encode the key bytes of the input row. + /// # Arguments + /// * `row_type` - the row type of the input row + /// * `key_fields` - the key fields to encode + /// * `lake_format` - the data lake format + /// + /// # Returns + /// key encoder + pub fn of( + row_type: &RowType, + key_fields: Vec, + data_lake_format: Option, + ) -> Result> { + match data_lake_format { + Some(DataLakeFormat::Paimon) => { + unimplemented!("KeyEncoder for Paimon format is currently unimplemented") + } + Some(DataLakeFormat::Lance) => Ok(Box::new(CompactedKeyEncoder::create_key_encoder( + row_type, + key_fields.as_slice(), + )?)), + Some(DataLakeFormat::Iceberg) => { + unimplemented!("KeyEncoder for Iceberg format is currently unimplemented") + } + None => Ok(Box::new(CompactedKeyEncoder::create_key_encoder( + row_type, + key_fields.as_slice(), + )?)), + } + } +} diff --git a/fluss-rust/crates/fluss/src/row/field_getter.rs b/fluss-rust/crates/fluss/src/row/field_getter.rs new file mode 100644 index 0000000000..3a9cf0fa81 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/field_getter.rs @@ -0,0 +1,116 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::metadata::DataType; +use crate::row::{Datum, InternalRow}; + +pub enum FieldGetter { + Nullable(InnerFieldGetter), + NonNullable(InnerFieldGetter), +} +impl FieldGetter { + pub fn get_field<'a>(&self, row: &'a dyn InternalRow) -> Datum<'a> { + match self { + FieldGetter::Nullable(getter) => { + if row.is_null_at(getter.pos()) { + Datum::Null + } else { + getter.get_field(row) + } + } + FieldGetter::NonNullable(getter) => getter.get_field(row), + } + } + + pub fn create(data_type: &DataType, pos: usize) -> FieldGetter { + let inner_field_getter = match data_type { + DataType::Char(t) => InnerFieldGetter::Char { + pos, + len: t.length() as usize, + }, + DataType::String(_) => InnerFieldGetter::String { pos }, + DataType::Boolean(_) => InnerFieldGetter::Bool { pos }, + DataType::Binary(t) => InnerFieldGetter::Binary { + pos, + len: t.length(), + }, + DataType::Bytes(_) => InnerFieldGetter::Bytes { pos }, + DataType::TinyInt(_) => InnerFieldGetter::TinyInt { pos }, + DataType::SmallInt(_) => InnerFieldGetter::SmallInt { pos }, + DataType::Int(_) => InnerFieldGetter::Int { pos }, + DataType::BigInt(_) => InnerFieldGetter::BigInt { pos }, + DataType::Float(_) => 
InnerFieldGetter::Float { pos }, + DataType::Double(_) => InnerFieldGetter::Double { pos }, + _ => unimplemented!("DataType {:?} is currently unimplemented", data_type), + }; + + if data_type.is_nullable() { + Self::Nullable(inner_field_getter) + } else { + Self::NonNullable(inner_field_getter) + } + } +} + +pub enum InnerFieldGetter { + Char { pos: usize, len: usize }, + String { pos: usize }, + Bool { pos: usize }, + Binary { pos: usize, len: usize }, + Bytes { pos: usize }, + TinyInt { pos: usize }, + SmallInt { pos: usize }, + Int { pos: usize }, + BigInt { pos: usize }, + Float { pos: usize }, + Double { pos: usize }, +} + +impl InnerFieldGetter { + pub fn get_field<'a>(&self, row: &'a dyn InternalRow) -> Datum<'a> { + match self { + InnerFieldGetter::Char { pos, len } => Datum::String(row.get_char(*pos, *len)), + InnerFieldGetter::String { pos } => Datum::from(row.get_string(*pos)), + InnerFieldGetter::Bool { pos } => Datum::from(row.get_boolean(*pos)), + InnerFieldGetter::Binary { pos, len } => Datum::from(row.get_binary(*pos, *len)), + InnerFieldGetter::Bytes { pos } => Datum::from(row.get_bytes(*pos)), + InnerFieldGetter::TinyInt { pos } => Datum::from(row.get_byte(*pos)), + InnerFieldGetter::SmallInt { pos } => Datum::from(row.get_short(*pos)), + InnerFieldGetter::Int { pos } => Datum::from(row.get_int(*pos)), + InnerFieldGetter::BigInt { pos } => Datum::from(row.get_long(*pos)), + InnerFieldGetter::Float { pos } => Datum::from(row.get_float(*pos)), + InnerFieldGetter::Double { pos } => Datum::from(row.get_double(*pos)), + //TODO Decimal, Date, Time, Timestamp, TimestampLTZ, Array, Map, Row + } + } + + pub fn pos(&self) -> usize { + match self { + Self::Char { pos, .. } + | Self::String { pos } + | Self::Bool { pos } + | Self::Binary { pos, .. } + | Self::Bytes { pos } + | Self::TinyInt { pos } + | Self::SmallInt { pos, .. } + | Self::Int { pos } + | Self::BigInt { pos } + | Self::Float { pos, .. 
} + | Self::Double { pos } => *pos, + } + } +} diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index 86fdf90c98..c321ab9d6b 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -19,11 +19,15 @@ mod column; mod datum; +mod binary; mod compacted; +mod encode; +mod field_getter; pub use column::*; pub use datum::*; +// TODO make functions return Result for better error handling pub trait InternalRow { /// Returns the number of fields in this row fn get_field_count(&self) -> usize; @@ -143,6 +147,11 @@ impl<'a> Default for GenericRow<'a> { } impl<'a> GenericRow<'a> { + pub fn from_data(data: Vec>>) -> GenericRow<'a> { + GenericRow { + values: data.into_iter().map(Into::into).collect(), + } + } pub fn new() -> GenericRow<'a> { GenericRow { values: vec![] } } From 1f547950559a6be6ae26393f7ba16389f0ed9ff8 Mon Sep 17 00:00:00 2001 From: Anton Borisov <148864665+fresh-borzoni@users.noreply.github.com> Date: Sat, 10 Jan 2026 02:00:12 +0000 Subject: [PATCH 055/287] feat: log scanner support poll record batch directly --- .../src/client/table/log_fetch_buffer.rs | 54 +++- .../crates/fluss/src/client/table/mod.rs | 2 +- .../crates/fluss/src/client/table/scanner.rs | 236 ++++++++++++++++-- fluss-rust/crates/fluss/src/record/arrow.rs | 22 ++ .../crates/fluss/tests/integration/table.rs | 133 ++++++++++ 5 files changed, 430 insertions(+), 17 deletions(-) diff --git a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs index cee104e020..e9bac53f1a 100644 --- a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs +++ b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs @@ -15,12 +15,14 @@ // specific language governing permissions and limitations // under the License. 
+use arrow::array::RecordBatch; +use parking_lot::Mutex; + use crate::error::Result; use crate::metadata::TableBucket; use crate::record::{ LogRecordBatch, LogRecordIterator, LogRecordsBatches, ReadContext, ScanRecord, }; -use parking_lot::Mutex; use std::collections::{HashMap, VecDeque}; use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; @@ -31,6 +33,7 @@ use tokio::sync::Notify; pub trait CompletedFetch: Send + Sync { fn table_bucket(&self) -> &TableBucket; fn fetch_records(&mut self, max_records: usize) -> Result>; + fn fetch_batches(&mut self, max_batches: usize) -> Result>; fn is_consumed(&self) -> bool; fn drain(&mut self); fn size_in_bytes(&self) -> usize; @@ -318,6 +321,38 @@ impl DefaultCompletedFetch { } } } + + /// Get the next batch directly without row iteration + fn next_fetched_batch(&mut self) -> Result> { + loop { + let Some(log_batch) = self.log_record_batch.next() else { + self.drain(); + return Ok(None); + }; + + let mut record_batch = log_batch.record_batch(&self.read_context)?; + + // Skip empty batches + if record_batch.num_rows() == 0 { + continue; + } + + // Truncate batch + let base_offset = log_batch.base_log_offset(); + if self.next_fetch_offset > base_offset { + let skip_count = (self.next_fetch_offset - base_offset) as usize; + if skip_count >= record_batch.num_rows() { + continue; + } + // Slice the batch to skip the first skip_count rows + record_batch = record_batch.slice(skip_count, record_batch.num_rows() - skip_count); + } + + self.next_fetch_offset = log_batch.next_log_offset(); + self.records_read += record_batch.num_rows(); + return Ok(Some(record_batch)); + } + } } impl CompletedFetch for DefaultCompletedFetch { @@ -346,6 +381,23 @@ impl CompletedFetch for DefaultCompletedFetch { Ok(scan_records) } + fn fetch_batches(&mut self, max_batches: usize) -> Result> { + if self.consumed { + return Ok(Vec::new()); + } + + let mut batches = Vec::with_capacity(max_batches.min(16)); + + for _ in 0..max_batches { + match 
self.next_fetched_batch()? { + Some(batch) => batches.push(batch), + None => break, + } + } + + Ok(batches) + } + fn is_consumed(&self) -> bool { self.consumed } diff --git a/fluss-rust/crates/fluss/src/client/table/mod.rs b/fluss-rust/crates/fluss/src/client/table/mod.rs index e2cf9e6d5e..26341d70a6 100644 --- a/fluss-rust/crates/fluss/src/client/table/mod.rs +++ b/fluss-rust/crates/fluss/src/client/table/mod.rs @@ -32,7 +32,7 @@ mod scanner; mod writer; pub use append::{AppendWriter, TableAppend}; -pub use scanner::{LogScanner, TableScan}; +pub use scanner::{LogScanner, RecordBatchLogScanner, TableScan}; #[allow(dead_code)] pub struct FlussTable<'a> { diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index 4255bb6841..7d22324d6b 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -15,6 +15,16 @@ // specific language governing permissions and limitations // under the License. 
+use arrow::array::RecordBatch; +use arrow_schema::SchemaRef; +use log::{debug, error, warn}; +use parking_lot::{Mutex, RwLock}; +use std::collections::{HashMap, HashSet}; +use std::slice::from_ref; +use std::sync::Arc; +use std::time::Duration; +use tempfile::TempDir; + use crate::client::connection::FlussConnection; use crate::client::credentials::CredentialsCache; use crate::client::metadata::Metadata; @@ -30,14 +40,6 @@ use crate::proto::{FetchLogRequest, PbFetchLogReqForBucket, PbFetchLogReqForTabl use crate::record::{LogRecordsBatches, ReadContext, ScanRecord, ScanRecords, to_arrow_schema}; use crate::rpc::{RpcClient, message}; use crate::util::FairBucketStatusMap; -use arrow_schema::SchemaRef; -use log::{debug, error, warn}; -use parking_lot::{Mutex, RwLock}; -use std::collections::{HashMap, HashSet}; -use std::slice::from_ref; -use std::sync::Arc; -use std::time::Duration; -use tempfile::TempDir; const LOG_FETCH_MAX_BYTES: i32 = 16 * 1024 * 1024; #[allow(dead_code)] @@ -216,16 +218,48 @@ impl<'a> TableScan<'a> { } pub fn create_log_scanner(self) -> Result { - LogScanner::new( + let inner = LogScannerInner::new( &self.table_info, self.metadata.clone(), self.conn.get_connections(), self.projected_fields, - ) + )?; + Ok(LogScanner { + inner: Arc::new(inner), + }) + } + + pub fn create_record_batch_log_scanner(self) -> Result { + let inner = LogScannerInner::new( + &self.table_info, + self.metadata.clone(), + self.conn.get_connections(), + self.projected_fields, + )?; + Ok(RecordBatchLogScanner { + inner: Arc::new(inner), + }) } } +/// Scanner for reading log records one at a time with per-record metadata. +/// +/// Use this scanner when you need access to individual record offsets and timestamps. +/// For batch-level access, use [`RecordBatchLogScanner`] instead. pub struct LogScanner { + inner: Arc, +} + +/// Scanner for reading log data as Arrow RecordBatches. 
+/// +/// More efficient than [`LogScanner`] for batch-level analytics where per-record +/// metadata (offsets, timestamps) is not needed. +pub struct RecordBatchLogScanner { + inner: Arc, +} + +/// Private shared implementation for both scanner types +struct LogScannerInner { table_path: TablePath, table_id: i64, metadata: Arc, @@ -233,8 +267,8 @@ pub struct LogScanner { log_fetcher: LogFetcher, } -impl LogScanner { - pub fn new( +impl LogScannerInner { + fn new( table_info: &TableInfo, metadata: Arc, connections: Arc, @@ -256,7 +290,7 @@ impl LogScanner { }) } - pub async fn poll(&self, timeout: Duration) -> Result { + async fn poll_records(&self, timeout: Duration) -> Result { let start = std::time::Instant::now(); let deadline = start + timeout; @@ -295,7 +329,7 @@ impl LogScanner { } } - pub async fn subscribe(&self, bucket: i32, offset: i64) -> Result<()> { + async fn subscribe(&self, bucket: i32, offset: i64) -> Result<()> { let table_bucket = TableBucket::new(self.table_id, bucket); self.metadata .check_and_update_table_metadata(from_ref(&self.table_path)) @@ -305,7 +339,7 @@ impl LogScanner { Ok(()) } - pub async fn subscribe_batch(&self, bucket_offsets: &HashMap) -> Result<()> { + async fn subscribe_batch(&self, bucket_offsets: &HashMap) -> Result<()> { self.metadata .check_and_update_table_metadata(from_ref(&self.table_path)) .await?; @@ -339,6 +373,76 @@ impl LogScanner { // Collect completed fetches from buffer self.log_fetcher.collect_fetches() } + + async fn poll_batches(&self, timeout: Duration) -> Result> { + let start = std::time::Instant::now(); + let deadline = start + timeout; + + loop { + let batches = self.poll_for_batches().await?; + + if !batches.is_empty() { + self.log_fetcher.send_fetches().await?; + return Ok(batches); + } + + let now = std::time::Instant::now(); + if now >= deadline { + return Ok(Vec::new()); + } + + let remaining = deadline - now; + let has_data = self + .log_fetcher + .log_fetch_buffer + .await_not_empty(remaining) + 
.await; + + if !has_data { + return Ok(Vec::new()); + } + } + } + + async fn poll_for_batches(&self) -> Result> { + let result = self.log_fetcher.collect_batches()?; + if !result.is_empty() { + return Ok(result); + } + + self.log_fetcher.send_fetches().await?; + self.log_fetcher.collect_batches() + } +} + +// Implementation for LogScanner (records mode) +impl LogScanner { + pub async fn poll(&self, timeout: Duration) -> Result { + self.inner.poll_records(timeout).await + } + + pub async fn subscribe(&self, bucket: i32, offset: i64) -> Result<()> { + self.inner.subscribe(bucket, offset).await + } + + pub async fn subscribe_batch(&self, bucket_offsets: &HashMap) -> Result<()> { + self.inner.subscribe_batch(bucket_offsets).await + } +} + +// Implementation for RecordBatchLogScanner (batches mode) +impl RecordBatchLogScanner { + pub async fn poll(&self, timeout: Duration) -> Result> { + self.inner.poll_batches(timeout).await + } + + pub async fn subscribe(&self, bucket: i32, offset: i64) -> Result<()> { + self.inner.subscribe(bucket, offset).await + } + + pub async fn subscribe_batch(&self, bucket_offsets: &HashMap) -> Result<()> { + self.inner.subscribe_batch(bucket_offsets).await + } } struct LogFetcher { @@ -801,6 +905,108 @@ impl LogFetcher { } } + /// Collect completed fetches as RecordBatches + fn collect_batches(&self) -> Result> { + // Limit memory usage with both batch count and byte size constraints. + // Max 100 batches per poll, but also check total bytes (soft cap ~64MB). 
+ const MAX_BATCHES: usize = 100; + const MAX_BYTES: usize = 64 * 1024 * 1024; // 64MB soft cap + let mut result: Vec = Vec::new(); + let mut batches_remaining = MAX_BATCHES; + let mut bytes_consumed: usize = 0; + + while batches_remaining > 0 && bytes_consumed < MAX_BYTES { + let next_in_line = self.log_fetch_buffer.next_in_line_fetch(); + + match next_in_line { + Some(mut next_fetch) if !next_fetch.is_consumed() => { + let batches = + self.fetch_batches_from_fetch(&mut next_fetch, batches_remaining)?; + let batch_count = batches.len(); + + if !batches.is_empty() { + // Track bytes consumed (soft cap - may exceed by one fetch) + let batch_bytes: usize = + batches.iter().map(|b| b.get_array_memory_size()).sum(); + bytes_consumed += batch_bytes; + + result.extend(batches); + batches_remaining = batches_remaining.saturating_sub(batch_count); + } + + if !next_fetch.is_consumed() { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(next_fetch)); + } + } + _ => { + if let Some(completed_fetch) = self.log_fetch_buffer.poll() { + if !completed_fetch.is_initialized() { + let size_in_bytes = completed_fetch.size_in_bytes(); + match self.initialize_fetch(completed_fetch) { + Ok(initialized) => { + self.log_fetch_buffer.set_next_in_line_fetch(initialized); + continue; + } + Err(e) => { + if result.is_empty() && size_in_bytes == 0 { + continue; + } + return Err(e); + } + } + } else { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(completed_fetch)); + } + } else { + break; + } + } + } + } + + Ok(result) + } + + fn fetch_batches_from_fetch( + &self, + next_in_line_fetch: &mut Box, + max_batches: usize, + ) -> Result> { + let table_bucket = next_in_line_fetch.table_bucket().clone(); + let current_offset = self.log_scanner_status.get_bucket_offset(&table_bucket); + + if current_offset.is_none() { + warn!( + "Ignoring fetched batches for {table_bucket:?} since the bucket has been unsubscribed" + ); + next_in_line_fetch.drain(); + return Ok(Vec::new()); + } + + let 
current_offset = current_offset.unwrap(); + let fetch_offset = next_in_line_fetch.next_fetch_offset(); + + if fetch_offset == current_offset { + let batches = next_in_line_fetch.fetch_batches(max_batches)?; + let next_fetch_offset = next_in_line_fetch.next_fetch_offset(); + + if next_fetch_offset > current_offset { + self.log_scanner_status + .update_offset(&table_bucket, next_fetch_offset); + } + + Ok(batches) + } else { + warn!( + "Ignoring fetched batches for {table_bucket:?} at offset {fetch_offset} since the current offset is {current_offset}" + ); + next_in_line_fetch.drain(); + Ok(Vec::new()) + } + } + async fn prepare_fetch_log_requests(&self) -> HashMap { let mut fetch_log_req_for_buckets = HashMap::new(); let mut table_id = None; diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index 5a5115edfa..89fb7b9c94 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -546,6 +546,28 @@ impl LogRecordBatch { }; Ok(log_record_iterator) } + + /// Returns the record batch directly without creating an iterator. + /// This is more efficient when you need the entire batch rather than + /// iterating row-by-row. + pub fn record_batch(&self, read_context: &ReadContext) -> Result { + if self.record_count() == 0 { + // Return empty batch with correct schema + return Ok(RecordBatch::new_empty(read_context.target_schema.clone())); + } + + let data = self.data.get(RECORDS_OFFSET..).ok_or_else(|| { + crate::error::Error::UnexpectedError { + message: format!( + "Corrupt log record batch: data length {} is less than RECORDS_OFFSET {}", + self.data.len(), + RECORDS_OFFSET + ), + source: None, + } + })?; + read_context.record_batch(data) + } } /// Parse an Arrow IPC message from a byte slice. 
diff --git a/fluss-rust/crates/fluss/tests/integration/table.rs b/fluss-rust/crates/fluss/tests/integration/table.rs index 0ac34c7635..4cba46993f 100644 --- a/fluss-rust/crates/fluss/tests/integration/table.rs +++ b/fluss-rust/crates/fluss/tests/integration/table.rs @@ -469,4 +469,137 @@ mod table_test { records.sort_by_key(|r| r.offset()); records } + + #[tokio::test] + async fn test_poll_batches() { + let cluster = get_fluss_cluster(); + let connection = cluster.get_fluss_connection().await; + let admin = connection.get_admin().await.expect("Failed to get admin"); + + let table_path = TablePath::new("fluss".to_string(), "test_poll_batches".to_string()); + let schema = Schema::builder() + .column("id", DataTypes::int()) + .column("name", DataTypes::string()) + .build() + .unwrap(); + + create_table( + &admin, + &table_path, + &TableDescriptor::builder().schema(schema).build().unwrap(), + ) + .await; + tokio::time::sleep(Duration::from_secs(1)).await; + + let table = connection.get_table(&table_path).await.unwrap(); + let scanner = table.new_scan().create_record_batch_log_scanner().unwrap(); + scanner.subscribe(0, 0).await.unwrap(); + + // Test 1: Empty table should return empty result + assert!( + scanner + .poll(Duration::from_millis(500)) + .await + .unwrap() + .is_empty() + ); + + let writer = table.new_append().unwrap().create_writer(); + writer + .append_arrow_batch( + record_batch!(("id", Int32, [1, 2]), ("name", Utf8, ["a", "b"])).unwrap(), + ) + .await + .unwrap(); + writer + .append_arrow_batch( + record_batch!(("id", Int32, [3, 4]), ("name", Utf8, ["c", "d"])).unwrap(), + ) + .await + .unwrap(); + writer + .append_arrow_batch( + record_batch!(("id", Int32, [5, 6]), ("name", Utf8, ["e", "f"])).unwrap(), + ) + .await + .unwrap(); + writer.flush().await.unwrap(); + + use arrow::array::Int32Array; + let batches = scanner.poll(Duration::from_secs(10)).await.unwrap(); + let mut all_ids: Vec = batches + .iter() + .flat_map(|b| { + (0..b.num_rows()).map(|i| { + 
b.column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(i) + }) + }) + .collect(); + + // Test 2: Order should be preserved across multiple batches + assert_eq!(all_ids, vec![1, 2, 3, 4, 5, 6]); + + writer + .append_arrow_batch( + record_batch!(("id", Int32, [7, 8]), ("name", Utf8, ["g", "h"])).unwrap(), + ) + .await + .unwrap(); + writer.flush().await.unwrap(); + + let more = scanner.poll(Duration::from_secs(10)).await.unwrap(); + let new_ids: Vec = more + .iter() + .flat_map(|b| { + (0..b.num_rows()).map(|i| { + b.column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(i) + }) + }) + .collect(); + + // Test 3: Subsequent polls should not return duplicate data (offset continuation) + assert_eq!(new_ids, vec![7, 8]); + + // Test 4: Subscribing from mid-offset should truncate batch (Arrow batch slicing) + // Server returns all records from start of batch, but client truncates to subscription offset + let trunc_scanner = table.new_scan().create_record_batch_log_scanner().unwrap(); + trunc_scanner.subscribe(0, 3).await.unwrap(); + let trunc_batches = trunc_scanner.poll(Duration::from_secs(10)).await.unwrap(); + let trunc_ids: Vec = trunc_batches + .iter() + .flat_map(|b| { + (0..b.num_rows()).map(|i| { + b.column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(i) + }) + }) + .collect(); + + // Subscribing from offset 3 should return [4,5,6,7,8], not [1,2,3,4,5,6,7,8] + assert_eq!(trunc_ids, vec![4, 5, 6, 7, 8]); + + // Test 5: Projection should only return requested columns + let proj = table + .new_scan() + .project_by_name(&["id"]) + .unwrap() + .create_record_batch_log_scanner() + .unwrap(); + proj.subscribe(0, 0).await.unwrap(); + let proj_batches = proj.poll(Duration::from_secs(10)).await.unwrap(); + + // Projected batch should have 1 column (id), not 2 (id, name) + assert_eq!(proj_batches[0].num_columns(), 1); + } } From d61086fa8457e521dd949f4648ddbbc9c4ad6c88 Mon Sep 17 00:00:00 2001 From: Kelvin Wu Date: Sat, 10 Jan 2026 22:01:24 +0800 
Subject: [PATCH 056/287] feat: introduce CompactedRowReader, CompactedRow, CompactedRowDeserializer (#131) --- .../crates/fluss/src/metadata/datatype.rs | 2 +- .../fluss/src/row/binary/binary_writer.rs | 2 +- .../fluss/src/row/compacted/compacted_row.rs | 260 ++++++++++++++++++ .../src/row/compacted/compacted_row_reader.rs | 218 +++++++++++++++ .../src/row/compacted/compacted_row_writer.rs | 5 +- .../crates/fluss/src/row/compacted/mod.rs | 9 + fluss-rust/crates/fluss/src/row/datum.rs | 10 +- .../src/row/encode/compacted_key_encoder.rs | 10 +- 8 files changed, 502 insertions(+), 14 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs create mode 100644 fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs diff --git a/fluss-rust/crates/fluss/src/metadata/datatype.rs b/fluss-rust/crates/fluss/src/metadata/datatype.rs index e5ccb9a8e9..c53cd273cb 100644 --- a/fluss-rust/crates/fluss/src/metadata/datatype.rs +++ b/fluss-rust/crates/fluss/src/metadata/datatype.rs @@ -861,7 +861,7 @@ impl RowType { pub fn with_data_types(data_types: Vec) -> Self { let mut fields: Vec = Vec::new(); data_types.iter().enumerate().for_each(|(idx, data_type)| { - fields.push(DataField::new(format!("f{}", idx), data_type.clone(), None)); + fields.push(DataField::new(format!("f{idx}"), data_type.clone(), None)); }); Self::with_nullable(true, fields) diff --git a/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs b/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs index a296777a30..44f10b6309 100644 --- a/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs +++ b/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs @@ -201,7 +201,7 @@ impl InnerValueWriter { } _ => { return Err(IllegalArgument { - message: format!("{:?} used to write value {:?}", self, value), + message: format!("{self:?} used to write value {value:?}"), }); } } diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs 
b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs new file mode 100644 index 0000000000..fca41c655f --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs @@ -0,0 +1,260 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use bytes::Bytes; + +use crate::metadata::DataType; +use crate::row::compacted::compacted_row_reader::{CompactedRowDeserializer, CompactedRowReader}; +use crate::row::{GenericRow, InternalRow}; + +// Reference implementation: +// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/row/compacted/CompactedRow.java +#[allow(dead_code)] +pub struct CompactedRow { + arity: usize, + segment: Bytes, + offset: usize, + size_in_bytes: usize, + decoded: bool, + decoded_row: GenericRow<'static>, + reader: CompactedRowReader, + deserializer: CompactedRowDeserializer, +} + +#[allow(dead_code)] +impl CompactedRow { + pub fn calculate_bit_set_width_in_bytes(arity: usize) -> usize { + arity.div_ceil(8) + } + + pub fn new(types: Vec) -> Self { + let arity = types.len(); + Self { + arity, + segment: Bytes::new(), + offset: 0, + size_in_bytes: 0, + decoded: false, + decoded_row: GenericRow::new(), + reader: CompactedRowReader::new(arity), + deserializer: CompactedRowDeserializer::new(types), + } + } + + pub fn from_bytes(types: Vec, data: Bytes) -> Self { + let arity = types.len(); + let size = data.len(); + Self { + arity, + segment: data, + offset: 0, + size_in_bytes: size, + decoded: false, + decoded_row: GenericRow::new(), + reader: CompactedRowReader::new(arity), + deserializer: CompactedRowDeserializer::new(types), + } + } + + pub fn point_to(&mut self, segment: Bytes, offset: usize, size_in_bytes: usize) { + self.segment = segment; + self.offset = offset; + self.size_in_bytes = size_in_bytes; + self.decoded = false; + } + + pub fn get_segment(&self) -> &Bytes { + &self.segment + } + + pub fn get_offset(&self) -> usize { + self.offset + } + + pub fn get_size_in_bytes(&self) -> usize { + self.size_in_bytes + } + + pub fn get_field_count(&self) -> usize { + self.arity + } + + pub fn is_null_at(&self, pos: usize) -> bool { + let byte_index = pos >> 3; + let bit = pos & 7; + let idx = self.offset + byte_index; + (self.segment[idx] & (1u8 << 
bit)) != 0 + } + + fn decoded_row(&mut self) -> &GenericRow<'static> { + if !self.decoded { + self.reader + .point_to(self.segment.clone(), self.offset, self.size_in_bytes); + self.decoded_row = self.deserializer.deserialize(&mut self.reader); + self.decoded = true; + } + &self.decoded_row + } + + pub fn get_boolean(&mut self, pos: usize) -> bool { + self.decoded_row().get_boolean(pos) + } + + pub fn get_byte(&mut self, pos: usize) -> i8 { + self.decoded_row().get_byte(pos) + } + + pub fn get_short(&mut self, pos: usize) -> i16 { + self.decoded_row().get_short(pos) + } + + pub fn get_int(&mut self, pos: usize) -> i32 { + self.decoded_row().get_int(pos) + } + + pub fn get_long(&mut self, pos: usize) -> i64 { + self.decoded_row().get_long(pos) + } + + pub fn get_float(&mut self, pos: usize) -> f32 { + self.decoded_row().get_float(pos) + } + + pub fn get_double(&mut self, pos: usize) -> f64 { + self.decoded_row().get_double(pos) + } + + pub fn get_string(&mut self, pos: usize) -> &str { + self.decoded_row().get_string(pos) + } + + pub fn get_bytes(&mut self, pos: usize) -> &[u8] { + self.decoded_row().get_bytes(pos) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{ + BigIntType, BooleanType, BytesType, DoubleType, FloatType, IntType, SmallIntType, + StringType, TinyIntType, + }; + use crate::row::compacted::compacted_row_writer::CompactedRowWriter; + + #[test] + fn test_compacted_row() { + // Test all primitive types + let types = vec![ + DataType::Boolean(BooleanType::new()), + DataType::TinyInt(TinyIntType::new()), + DataType::SmallInt(SmallIntType::new()), + DataType::Int(IntType::new()), + DataType::BigInt(BigIntType::new()), + DataType::Float(FloatType::new()), + DataType::Double(DoubleType::new()), + DataType::String(StringType::new()), + DataType::Bytes(BytesType::new()), + ]; + + let mut row = CompactedRow::new(types.clone()); + let mut writer = CompactedRowWriter::new(types.len()); + + writer.write_boolean(true); + 
writer.write_byte(1); + writer.write_short(100); + writer.write_int(1000); + writer.write_long(10000); + writer.write_float(1.5); + writer.write_double(2.5); + writer.write_string("Hello World"); + writer.write_bytes(&[1, 2, 3, 4, 5]); + + row.point_to(writer.to_bytes(), 0, writer.position()); + + assert_eq!(row.get_field_count(), 9); + assert!(row.get_boolean(0)); + assert_eq!(row.get_byte(1), 1); + assert_eq!(row.get_short(2), 100); + assert_eq!(row.get_int(3), 1000); + assert_eq!(row.get_long(4), 10000); + assert_eq!(row.get_float(5), 1.5); + assert_eq!(row.get_double(6), 2.5); + assert_eq!(row.get_string(7), "Hello World"); + assert_eq!(row.get_bytes(8), &[1, 2, 3, 4, 5]); + + // Test with nulls + let types = vec![ + DataType::Int(IntType::new()), + DataType::String(StringType::new()), + DataType::Double(DoubleType::new()), + ]; + + let mut row = CompactedRow::new(types.clone()); + let mut writer = CompactedRowWriter::new(types.len()); + + writer.write_int(100); + writer.set_null_at(1); + writer.write_double(2.71); + + row.point_to(writer.to_bytes(), 0, writer.position()); + + assert!(!row.is_null_at(0)); + assert!(row.is_null_at(1)); + assert!(!row.is_null_at(2)); + assert_eq!(row.get_int(0), 100); + assert_eq!(row.get_double(2), 2.71); + + // Test multiple reads (caching) + assert_eq!(row.get_int(0), 100); + assert_eq!(row.get_int(0), 100); + + // Test from_bytes + let types = vec![ + DataType::Int(IntType::new()), + DataType::String(StringType::new()), + ]; + + let mut writer = CompactedRowWriter::new(types.len()); + writer.write_int(42); + writer.write_string("test"); + + let mut row = CompactedRow::from_bytes(types, writer.to_bytes()); + + assert_eq!(row.get_int(0), 42); + assert_eq!(row.get_string(1), "test"); + + // Test large row + let num_fields = 100; + let types: Vec = (0..num_fields) + .map(|_| DataType::Int(IntType::new())) + .collect(); + + let mut row = CompactedRow::new(types.clone()); + let mut writer = CompactedRowWriter::new(num_fields); + + 
for i in 0..num_fields { + writer.write_int((i * 10) as i32); + } + + row.point_to(writer.to_bytes(), 0, writer.position()); + + for i in 0..num_fields { + assert_eq!(row.get_int(i), (i * 10) as i32); + } + } +} diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs new file mode 100644 index 0000000000..19afe887d4 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs @@ -0,0 +1,218 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use bytes::Bytes; + +use crate::{ + metadata::DataType, + row::{ + Datum, GenericRow, + compacted::{compacted_row::CompactedRow, compacted_row_writer::CompactedRowWriter}, + }, +}; + +#[allow(dead_code)] +pub struct CompactedRowDeserializer { + schema: Vec, +} + +#[allow(dead_code)] +impl CompactedRowDeserializer { + pub fn new(schema: Vec) -> Self { + Self { schema } + } + + pub fn deserialize(&self, reader: &mut CompactedRowReader) -> GenericRow<'static> { + let mut row = GenericRow::new(); + for (pos, dtype) in self.schema.iter().enumerate() { + if reader.is_null_at(pos) { + row.set_field(pos, Datum::Null); + continue; + } + let datum = match dtype { + DataType::Boolean(_) => Datum::Bool(reader.read_boolean()), + DataType::TinyInt(_) => Datum::Int8(reader.read_byte() as i8), + DataType::SmallInt(_) => Datum::Int16(reader.read_short()), + DataType::Int(_) => Datum::Int32(reader.read_int()), + DataType::BigInt(_) => Datum::Int64(reader.read_long()), + DataType::Float(_) => Datum::Float32(reader.read_float().into()), + DataType::Double(_) => Datum::Float64(reader.read_double().into()), + // TODO: use read_char(length) in the future, but need to keep compatibility + DataType::Char(_) | DataType::String(_) => Datum::OwnedString(reader.read_string()), + // TODO: use read_binary(length) in the future, but need to keep compatibility + DataType::Bytes(_) | DataType::Binary(_) => { + Datum::Blob(reader.read_bytes().into_vec().into()) + } + _ => panic!("unsupported DataType in CompactedRowDeserializer"), + }; + row.set_field(pos, datum); + } + row + } +} + +// Reference implementation: +// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/row/compacted/CompactedRowReader.java +#[allow(dead_code)] +pub struct CompactedRowReader { + segment: Bytes, + offset: usize, + position: usize, + limit: usize, + header_size_in_bytes: usize, +} + +#[allow(dead_code)] +impl CompactedRowReader { + pub fn new(field_count: usize) -> Self { + let header 
= CompactedRow::calculate_bit_set_width_in_bytes(field_count); + Self { + header_size_in_bytes: header, + segment: Bytes::new(), + offset: 0, + position: 0, + limit: 0, + } + } + + pub fn point_to(&mut self, data: Bytes, offset: usize, length: usize) { + let limit = offset + length; + let position = offset + self.header_size_in_bytes; + + debug_assert!(limit <= data.len()); + debug_assert!(position <= limit); + + self.segment = data; + self.offset = offset; + self.position = position; + self.limit = limit; + } + + pub fn is_null_at(&self, pos: usize) -> bool { + let byte_index = pos >> 3; + let bit = pos & 7; + debug_assert!(byte_index < self.header_size_in_bytes); + let idx = self.offset + byte_index; + (self.segment[idx] & (1u8 << bit)) != 0 + } + + pub fn read_boolean(&mut self) -> bool { + self.read_byte() != 0 + } + + pub fn read_byte(&mut self) -> u8 { + debug_assert!(self.position < self.limit); + let b = self.segment[self.position]; + self.position += 1; + b + } + + pub fn read_short(&mut self) -> i16 { + debug_assert!(self.position + 2 <= self.limit); + let bytes_slice = &self.segment[self.position..self.position + 2]; + let byte_array: [u8; 2] = bytes_slice + .try_into() + .expect("Slice must be exactly 2 bytes long"); + + self.position += 2; + i16::from_ne_bytes(byte_array) + } + + pub fn read_int(&mut self) -> i32 { + let mut result: u32 = 0; + let mut shift = 0; + + for _ in 0..CompactedRowWriter::MAX_INT_SIZE { + let b = self.read_byte(); + result |= ((b & 0x7F) as u32) << shift; + if (b & 0x80) == 0 { + return result as i32; + } + shift += 7; + } + + panic!("Invalid input stream."); + } + + pub fn read_long(&mut self) -> i64 { + let mut result: u64 = 0; + let mut shift = 0; + + for _ in 0..CompactedRowWriter::MAX_LONG_SIZE { + let b = self.read_byte(); + result |= ((b & 0x7F) as u64) << shift; + if (b & 0x80) == 0 { + return result as i64; + } + shift += 7; + } + + panic!("Invalid input stream."); + } + + pub fn read_float(&mut self) -> f32 { + 
debug_assert!(self.position + 4 <= self.limit); + let bytes_slice = &self.segment[self.position..self.position + 4]; + let byte_array: [u8; 4] = bytes_slice + .try_into() + .expect("Slice must be exactly 4 bytes long"); + + self.position += 4; + f32::from_ne_bytes(byte_array) + } + + pub fn read_double(&mut self) -> f64 { + debug_assert!(self.position + 8 <= self.limit); + let bytes_slice = &self.segment[self.position..self.position + 8]; + let byte_array: [u8; 8] = bytes_slice + .try_into() + .expect("Slice must be exactly 8 bytes long"); + + self.position += 8; + f64::from_ne_bytes(byte_array) + } + + pub fn read_binary(&mut self, length: usize) -> Bytes { + debug_assert!(self.position + length <= self.limit); + + let start = self.position; + let end = start + length; + self.position = end; + + self.segment.slice(start..end) + } + + pub fn read_bytes(&mut self) -> Box<[u8]> { + let len = self.read_int(); + debug_assert!(len >= 0); + + let len = len as usize; + debug_assert!(self.position + len <= self.limit); + + let start = self.position; + let end = start + len; + self.position = end; + + self.segment[start..end].to_vec().into_boxed_slice() + } + + pub fn read_string(&mut self) -> String { + let bytes = self.read_bytes(); + String::from_utf8(bytes.into_vec()) + .unwrap_or_else(|e| panic!("Invalid UTF-8 in string data: {e}")) + } +} diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs index 2debab19ae..834512350c 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs @@ -18,6 +18,8 @@ use bytes::{Bytes, BytesMut}; use std::cmp; +use crate::row::compacted::compacted_row::CompactedRow; + // Writer for CompactedRow // Reference implementation: // 
https://github.com/apache/fluss/blob/d4a72fad240d4b81563aaf83fa3b09b5058674ed/fluss-common/src/main/java/org/apache/fluss/row/compacted/CompactedRowWriter.java#L71 @@ -34,8 +36,7 @@ impl CompactedRowWriter { pub const MAX_LONG_SIZE: usize = 10; pub fn new(field_count: usize) -> Self { - // bitset width in bytes, it should be in CompactedRow - let header_size = field_count.div_ceil(8); + let header_size = CompactedRow::calculate_bit_set_width_in_bytes(field_count); let cap = cmp::max(64, header_size); let mut buffer = BytesMut::with_capacity(cap); diff --git a/fluss-rust/crates/fluss/src/row/compacted/mod.rs b/fluss-rust/crates/fluss/src/row/compacted/mod.rs index c81eb5a50b..3361078321 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/mod.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/mod.rs @@ -16,6 +16,15 @@ // under the License. mod compacted_key_writer; + +mod compacted_row; +mod compacted_row_reader; mod compacted_row_writer; pub use compacted_key_writer::CompactedKeyWriter; +#[allow(unused_imports)] +pub use compacted_row::CompactedRow; +#[allow(unused_imports)] +pub use compacted_row_reader::{CompactedRowDeserializer, CompactedRowReader}; +#[allow(unused_imports)] +pub use compacted_row_writer::CompactedRowWriter; diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs index 28a378fd56..78872a9dd4 100644 --- a/fluss-rust/crates/fluss/src/row/datum.rs +++ b/fluss-rust/crates/fluss/src/row/datum.rs @@ -53,6 +53,9 @@ pub enum Datum<'a> { Float64(F64), #[display("'{0}'")] String(&'a str), + /// Owned string + #[display("'{0}'")] + OwnedString(String), #[display("{0}")] Blob(Blob), #[display("{:?}")] @@ -75,6 +78,7 @@ impl Datum<'_> { pub fn as_str(&self) -> &str { match self { Self::String(s) => s, + Self::OwnedString(s) => s.as_str(), _ => panic!("not a string: {self:?}"), } } @@ -216,13 +220,14 @@ impl TryFrom<&Datum<'_>> for bool { } } -impl<'a> TryFrom<&Datum<'a>> for &'a str { +impl<'b, 'a: 'b> TryFrom<&'b 
Datum<'a>> for &'b str { type Error = (); #[inline] - fn try_from(from: &Datum<'a>) -> std::result::Result { + fn try_from(from: &'b Datum<'a>) -> std::result::Result { match from { Datum::String(i) => Ok(*i), + Datum::OwnedString(s) => Ok(s.as_str()), _ => Err(()), } } @@ -291,6 +296,7 @@ impl Datum<'_> { Datum::Float32(v) => append_value_to_arrow!(Float32Builder, v.into_inner()), Datum::Float64(v) => append_value_to_arrow!(Float64Builder, v.into_inner()), Datum::String(v) => append_value_to_arrow!(StringBuilder, *v), + Datum::OwnedString(v) => append_value_to_arrow!(StringBuilder, v.as_str()), Datum::Blob(v) => append_value_to_arrow!(BinaryBuilder, v.as_ref()), Datum::BorrowedBlob(v) => append_value_to_arrow!(BinaryBuilder, *v), Datum::Decimal(_) | Datum::Date(_) | Datum::Timestamp(_) | Datum::TimestampTz(_) => { diff --git a/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs b/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs index b9335a3c13..ebe3da2a0a 100644 --- a/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs +++ b/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs @@ -49,10 +49,7 @@ impl CompactedKeyEncoder { Some(idx) => encode_col_indexes.push(idx), None => { return Err(IllegalArgument { - message: format!( - "Field {:?} not found in input row type {:?}", - key, row_type - ), + message: format!("Field {key:?} not found in input row type {row_type:?}"), }); } } @@ -89,10 +86,7 @@ impl KeyEncoder for CompactedKeyEncoder { match &field_getter.get_field(row) { Datum::Null => { return Err(IllegalArgument { - message: format!( - "Cannot encode key with null value at position: {:?}", - pos - ), + message: format!("Cannot encode key with null value at position: {pos:?}"), }); } value => self.field_encoders.get(pos).unwrap().write_value( From 98497bef8adbb2cbd7a2a6a34b6af4469678653e Mon Sep 17 00:00:00 2001 From: Anton Borisov <148864665+fresh-borzoni@users.noreply.github.com> Date: Sat, 10 Jan 2026 16:04:31 
+0000 Subject: [PATCH 057/287] chore: fix Box leaking and batch API refactor (#136) --- fluss-rust/bindings/python/src/table.rs | 100 +++--------------------- 1 file changed, 11 insertions(+), 89 deletions(-) diff --git a/fluss-rust/bindings/python/src/table.rs b/fluss-rust/bindings/python/src/table.rs index 71759d7505..8a1164856b 100644 --- a/fluss-rust/bindings/python/src/table.rs +++ b/fluss-rust/bindings/python/src/table.rs @@ -17,6 +17,8 @@ use crate::TOKIO_RUNTIME; use crate::*; +use arrow::array::RecordBatch; +use arrow_pyarrow::FromPyArrow; use fluss::client::EARLIEST_OFFSET; use fluss::rpc::message::OffsetSpec; use pyo3_async_runtimes::tokio::future_into_py; @@ -148,34 +150,17 @@ impl AppendWriter { /// Write Arrow batch data pub fn write_arrow_batch(&mut self, py: Python, batch: Py) -> PyResult<()> { - // Extract number of rows and columns from the Arrow batch - let num_rows: usize = batch.getattr(py, "num_rows")?.extract(py)?; - let num_columns: usize = batch.getattr(py, "num_columns")?.extract(py)?; - - // Process each row in the batch - for row_idx in 0..num_rows { - let mut generic_row = fcore::row::GenericRow::new(); - - // Extract values for each column in this row - for col_idx in 0..num_columns { - let column = batch.call_method1(py, "column", (col_idx,))?; - let value = column.call_method1(py, "__getitem__", (row_idx,))?; - - // Convert the Python value to a Datum and add to the row - let datum = self.convert_python_value_to_datum(py, value)?; - generic_row.set_field(col_idx, datum); - } + // This shares the underlying Arrow buffers without copying data + let batch_bound = batch.bind(py); + let rust_batch: RecordBatch = FromPyArrow::from_pyarrow_bound(batch_bound) + .map_err(|e| FlussError::new_err(format!("Failed to convert RecordBatch: {e}")))?; - // Append this row using the async append method - TOKIO_RUNTIME.block_on(async { - self.inner - .append(generic_row) - .await - .map_err(|e| FlussError::new_err(e.to_string())) - })?; - } + // 
Release the GIL before blocking on async operation + let result = py.detach(|| { + TOKIO_RUNTIME.block_on(async { self.inner.append_arrow_batch(rust_batch).await }) + }); - Ok(()) + result.map_err(|e| FlussError::new_err(e.to_string())) } /// Write Pandas DataFrame data @@ -213,69 +198,6 @@ impl AppendWriter { pub fn from_core(append: fcore::client::AppendWriter) -> Self { Self { inner: append } } - - fn convert_python_value_to_datum( - &self, - py: Python, - value: Py, - ) -> PyResult> { - use fcore::row::{Blob, Datum, F32, F64}; - - // Check for None (null) - if value.is_none(py) { - return Ok(Datum::Null); - } - - // Try to extract different types - if let Ok(type_name) = value.bind(py).get_type().name() { - if type_name == "StringScalar" { - if let Ok(py_value) = value.call_method0(py, "as_py") { - if let Ok(str_val) = py_value.extract::(py) { - let leaked_str: &'static str = Box::leak(str_val.into_boxed_str()); - return Ok(Datum::String(leaked_str)); - } - } - } - } - - if let Ok(bool_val) = value.extract::(py) { - return Ok(Datum::Bool(bool_val)); - } - - if let Ok(int_val) = value.extract::(py) { - return Ok(Datum::Int32(int_val)); - } - - if let Ok(int_val) = value.extract::(py) { - return Ok(Datum::Int64(int_val)); - } - - if let Ok(float_val) = value.extract::(py) { - return Ok(Datum::Float32(F32::from(float_val))); - } - - if let Ok(float_val) = value.extract::(py) { - return Ok(Datum::Float64(F64::from(float_val))); - } - - if let Ok(str_val) = value.extract::(py) { - // Convert String to &'static str by leaking memory - // This is a simplified approach - in production, you might want better lifetime management - let leaked_str: &'static str = Box::leak(str_val.into_boxed_str()); - return Ok(Datum::String(leaked_str)); - } - - if let Ok(bytes_val) = value.extract::>(py) { - let blob = Blob::from(bytes_val); - return Ok(Datum::Blob(blob)); - } - - // If we can't convert, return an error - let type_name = value.bind(py).get_type().name()?; - 
Err(FlussError::new_err(format!( - "Cannot convert Python value to Datum: {type_name:?}" - ))) - } } /// Scanner for reading log data from a Fluss table From 0f4f4ec62e4fe209fd8f3ecaca7de044d236eccd Mon Sep 17 00:00:00 2001 From: AlexZhao Date: Sun, 11 Jan 2026 10:51:55 +0800 Subject: [PATCH 058/287] chore: Improve write path error handling logic (#132) --- fluss-rust/crates/fluss/build.rs | 4 +- .../crates/fluss/src/client/credentials.rs | 51 ++ .../crates/fluss/src/client/metadata.rs | 44 +- .../fluss/src/client/write/accumulator.rs | 73 ++- .../crates/fluss/src/client/write/batch.rs | 87 +++- .../fluss/src/client/write/broadcast.rs | 4 + .../fluss/src/client/write/bucket_assigner.rs | 40 ++ .../crates/fluss/src/client/write/mod.rs | 18 +- .../crates/fluss/src/client/write/sender.rs | 441 ++++++++++++++++-- fluss-rust/crates/fluss/src/lib.rs | 3 + fluss-rust/crates/fluss/src/row/datum.rs | 66 +++ fluss-rust/crates/fluss/src/rpc/api_key.rs | 38 ++ .../crates/fluss/src/rpc/api_version.rs | 25 + fluss-rust/crates/fluss/src/rpc/convert.rs | 48 ++ .../crates/fluss/src/rpc/fluss_api_error.rs | 35 ++ .../fluss/src/rpc/message/produce_log.rs | 6 +- fluss-rust/crates/fluss/src/test_utils.rs | 88 ++++ 17 files changed, 1006 insertions(+), 65 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/test_utils.rs diff --git a/fluss-rust/crates/fluss/build.rs b/fluss-rust/crates/fluss/build.rs index a83cd056b5..1564313732 100644 --- a/fluss-rust/crates/fluss/build.rs +++ b/fluss-rust/crates/fluss/build.rs @@ -18,6 +18,8 @@ use std::io::Result; fn main() -> Result<()> { - prost_build::compile_protos(&["src/proto/fluss_api.proto"], &["src/proto"])?; + let mut config = prost_build::Config::new(); + config.bytes([".proto.PbProduceLogReqForBucket.records"]); + config.compile_protos(&["src/proto/fluss_api.proto"], &["src/proto"])?; Ok(()) } diff --git a/fluss-rust/crates/fluss/src/client/credentials.rs b/fluss-rust/crates/fluss/src/client/credentials.rs index 
ffb682ed7d..c520b4416d 100644 --- a/fluss-rust/crates/fluss/src/client/credentials.rs +++ b/fluss-rust/crates/fluss/src/client/credentials.rs @@ -156,3 +156,54 @@ impl CredentialsCache { Ok(props) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::client::metadata::Metadata; + use crate::cluster::Cluster; + + #[test] + fn convert_hadoop_key_to_opendal_maps_known_keys() { + let (key, invert) = convert_hadoop_key_to_opendal("fs.s3a.endpoint").expect("key"); + assert_eq!(key, "endpoint"); + assert!(!invert); + + let (key, invert) = convert_hadoop_key_to_opendal("fs.s3a.path.style.access").expect("key"); + assert_eq!(key, "enable_virtual_host_style"); + assert!(invert); + + assert!(convert_hadoop_key_to_opendal("fs.s3a.connection.ssl.enabled").is_none()); + assert!(convert_hadoop_key_to_opendal("unknown.key").is_none()); + } + + #[tokio::test] + async fn credentials_cache_returns_cached_props() -> Result<()> { + let cached = CachedToken { + access_key_id: "ak".to_string(), + secret_access_key: "sk".to_string(), + security_token: Some("token".to_string()), + addition_infos: HashMap::from([( + "fs.s3a.path.style.access".to_string(), + "true".to_string(), + )]), + cached_at: Instant::now(), + }; + + let cache = CredentialsCache { + inner: RwLock::new(Some(cached)), + rpc_client: Arc::new(RpcClient::new()), + metadata: Arc::new(Metadata::new_for_test(Arc::new(Cluster::default()))), + }; + + let props = cache.get_or_refresh().await?; + assert_eq!(props.get("access_key_id"), Some(&"ak".to_string())); + assert_eq!(props.get("secret_access_key"), Some(&"sk".to_string())); + assert_eq!(props.get("security_token"), Some(&"token".to_string())); + assert_eq!( + props.get("enable_virtual_host_style"), + Some(&"false".to_string()) + ); + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/client/metadata.rs b/fluss-rust/crates/fluss/src/client/metadata.rs index a51442254c..0e6f965131 100644 --- a/fluss-rust/crates/fluss/src/client/metadata.rs +++ 
b/fluss-rust/crates/fluss/src/client/metadata.rs @@ -135,7 +135,47 @@ impl Metadata { guard.clone() } - pub fn leader_for(&self, _table_bucket: &TableBucket) -> Option<&ServerNode> { - todo!() + pub fn leader_for(&self, table_bucket: &TableBucket) -> Option { + let cluster = self.cluster.read(); + cluster.leader_for(table_bucket).cloned() + } +} + +#[cfg(test)] +impl Metadata { + pub(crate) fn new_for_test(cluster: Arc) -> Self { + Metadata { + cluster: RwLock::new(cluster), + connections: Arc::new(RpcClient::new()), + bootstrap: Arc::from(""), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{TableBucket, TablePath}; + use crate::test_utils::build_cluster_arc; + + #[test] + fn leader_for_returns_server() { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Metadata::new_for_test(cluster); + let leader = metadata + .leader_for(&TableBucket::new(1, 0)) + .expect("leader"); + assert_eq!(leader.id(), 1); + } + + #[test] + fn invalidate_server_removes_leader() { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Metadata::new_for_test(cluster); + metadata.invalidate_server(&1, vec![1]); + let cluster = metadata.get_cluster(); + assert!(cluster.get_tablet_server(1).is_none()); } } diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs index beae0caacc..001d0aa7d0 100644 --- a/fluss-rust/crates/fluss/src/client/write/accumulator.rs +++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs @@ -250,7 +250,7 @@ impl RecordAccumulator { cluster: Arc, nodes: &HashSet, max_size: i32, - ) -> Result>>> { + ) -> Result>> { if nodes.is_empty() { return Ok(HashMap::new()); } @@ -272,7 +272,7 @@ impl RecordAccumulator { cluster: &Cluster, node: &ServerNode, max_size: i32, - ) -> Result>> { + ) -> 
Result> { let mut size = 0; let buckets = self.get_all_buckets_in_current_node(node, cluster); let mut ready = Vec::new(); @@ -324,10 +324,10 @@ impl RecordAccumulator { // mark the batch as drained. batch.drained(current_time_ms()); - ready.push(Arc::new(ReadyWriteBatch { + ready.push(ReadyWriteBatch { table_bucket, write_batch: batch, - })); + }); } } } @@ -342,6 +342,29 @@ impl RecordAccumulator { self.incomplete_batches.write().remove(&batch_id); } + pub async fn re_enqueue(&self, ready_write_batch: ReadyWriteBatch) { + ready_write_batch.write_batch.re_enqueued(); + let table_path = ready_write_batch.write_batch.table_path().clone(); + let bucket_id = ready_write_batch.table_bucket.bucket_id(); + let table_id = u64::try_from(ready_write_batch.table_bucket.table_id()).unwrap_or(0); + let mut binding = + self.write_batches + .entry(table_path) + .or_insert_with(|| BucketAndWriteBatches { + table_id, + is_partitioned_table: false, + partition_id: None, + batches: Default::default(), + }); + let bucket_and_batches = binding.value_mut(); + let dq = bucket_and_batches + .batches + .entry(bucket_id) + .or_insert_with(|| Mutex::new(VecDeque::new())); + let mut dq_guard = dq.lock().await; + dq_guard.push_front(ready_write_batch.write_batch); + } + fn get_all_buckets_in_current_node( &self, current: &ServerNode, @@ -446,3 +469,45 @@ impl ReadyCheckResult { } } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::TablePath; + use crate::row::{Datum, GenericRow}; + use crate::test_utils::build_cluster; + use std::sync::Arc; + + #[tokio::test] + async fn re_enqueue_increments_attempts() -> Result<()> { + let config = Config::default(); + let accumulator = RecordAccumulator::new(config); + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + let cluster = Arc::new(build_cluster(table_path.as_ref(), 1, 1)); + let record = WriteRecord::new( + table_path.clone(), + GenericRow { + values: vec![Datum::Int32(1)], + }, + ); + + 
accumulator.append(&record, 0, &cluster, false).await?; + + let server = cluster.get_tablet_server(1).expect("server"); + let nodes = HashSet::from([server.clone()]); + let mut batches = accumulator + .drain(cluster.clone(), &nodes, 1024 * 1024) + .await?; + let mut drained = batches.remove(&1).expect("drained batches"); + let batch = drained.pop().expect("batch"); + assert_eq!(batch.write_batch.attempts(), 0); + + accumulator.re_enqueue(batch).await; + + let mut batches = accumulator.drain(cluster, &nodes, 1024 * 1024).await?; + let mut drained = batches.remove(&1).expect("drained batches"); + let batch = drained.pop().expect("batch"); + assert_eq!(batch.write_batch.attempts(), 1); + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/batch.rs b/fluss-rust/crates/fluss/src/client/write/batch.rs index ba04db4ac6..1f54226f39 100644 --- a/fluss-rust/crates/fluss/src/client/write/batch.rs +++ b/fluss-rust/crates/fluss/src/client/write/batch.rs @@ -22,7 +22,10 @@ use crate::compression::ArrowCompressionInfo; use crate::error::Result; use crate::metadata::{DataType, TablePath}; use crate::record::MemoryLogRecordsArrowBuilder; +use bytes::Bytes; +use parking_lot::Mutex; use std::cmp::max; +use std::sync::atomic::{AtomicBool, AtomicI32, Ordering}; #[allow(dead_code)] pub struct InnerWriteBatch { @@ -31,7 +34,8 @@ pub struct InnerWriteBatch { create_ms: i64, bucket_id: BucketId, results: BroadcastOnce, - completed: bool, + completed: AtomicBool, + attempts: AtomicI32, drained_ms: i64, } @@ -43,7 +47,8 @@ impl InnerWriteBatch { create_ms, bucket_id, results: Default::default(), - completed: Default::default(), + completed: AtomicBool::new(false), + attempts: AtomicI32::new(0), drained_ms: -1, } } @@ -53,15 +58,36 @@ impl InnerWriteBatch { } fn complete(&self, write_result: BatchWriteResult) -> bool { - if !self.completed { - self.results.broadcast(write_result); + if self + .completed + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + 
.is_err() + { + return false; } + self.results.broadcast(write_result); true } fn drained(&mut self, now_ms: i64) { self.drained_ms = max(self.drained_ms, now_ms); } + + fn table_path(&self) -> &TablePath { + &self.table_path + } + + fn attempts(&self) -> i32 { + self.attempts.load(Ordering::Acquire) + } + + fn re_enqueued(&self) { + self.attempts.fetch_add(1, Ordering::AcqRel); + } + + fn is_done(&self) -> bool { + self.completed.load(Ordering::Acquire) + } } pub enum WriteBatch { @@ -112,7 +138,7 @@ impl WriteBatch { } } - pub fn build(&self) -> Result> { + pub fn build(&self) -> Result { match self { WriteBatch::ArrowLog(batch) => batch.build(), } @@ -125,11 +151,28 @@ impl WriteBatch { pub fn batch_id(&self) -> i64 { self.inner_batch().batch_id } + + pub fn table_path(&self) -> &TablePath { + self.inner_batch().table_path() + } + + pub fn attempts(&self) -> i32 { + self.inner_batch().attempts() + } + + pub fn re_enqueued(&self) { + self.inner_batch().re_enqueued(); + } + + pub fn is_done(&self) -> bool { + self.inner_batch().is_done() + } } pub struct ArrowLogWriteBatch { pub write_batch: InnerWriteBatch, pub arrow_builder: MemoryLogRecordsArrowBuilder, + built_records: Mutex>, } impl ArrowLogWriteBatch { @@ -153,6 +196,7 @@ impl ArrowLogWriteBatch { to_append_record_batch, arrow_compression_info, ), + built_records: Mutex::new(None), } } @@ -174,8 +218,14 @@ impl ArrowLogWriteBatch { } } - pub fn build(&self) -> Result> { - self.arrow_builder.build() + pub fn build(&self) -> Result { + let mut cached = self.built_records.lock(); + if let Some(bytes) = cached.as_ref() { + return Ok(bytes.clone()); + } + let bytes = Bytes::from(self.arrow_builder.build()?); + *cached = Some(bytes.clone()); + Ok(bytes) } pub fn is_closed(&self) -> bool { @@ -186,3 +236,26 @@ impl ArrowLogWriteBatch { self.arrow_builder.close() } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::TablePath; + + #[test] + fn complete_only_once() { + let batch = + 
InnerWriteBatch::new(1, TablePath::new("db".to_string(), "tbl".to_string()), 0, 0); + assert!(batch.complete(Ok(()))); + assert!(!batch.complete(Err(crate::client::broadcast::Error::Dropped))); + } + + #[test] + fn attempts_increment_on_reenqueue() { + let batch = + InnerWriteBatch::new(1, TablePath::new("db".to_string(), "tbl".to_string()), 0, 0); + assert_eq!(batch.attempts(), 0); + batch.re_enqueued(); + assert_eq!(batch.attempts(), 1); + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/broadcast.rs b/fluss-rust/crates/fluss/src/client/write/broadcast.rs index d2e7f0c91c..ec45776599 100644 --- a/fluss-rust/crates/fluss/src/client/write/broadcast.rs +++ b/fluss-rust/crates/fluss/src/client/write/broadcast.rs @@ -28,6 +28,10 @@ pub type BatchWriteResult = Result<(), Error>; pub enum Error { #[error("BroadcastOnce dropped")] Dropped, + #[error("Write failed: {message} (code {code})")] + WriteFailed { code: i32, message: String }, + #[error("Write failed before request was sent: {message}")] + Client { message: String }, } #[derive(Debug, Clone)] diff --git a/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs b/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs index 44b2673697..2370719efe 100644 --- a/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs +++ b/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs @@ -146,3 +146,43 @@ impl BucketAssigner for HashBucketAssigner { self.bucketing_function.bucketing(key, self.num_buckets) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::bucketing::BucketingFunction; + use crate::cluster::Cluster; + use crate::metadata::TablePath; + use crate::test_utils::build_cluster; + + #[test] + fn sticky_bucket_assigner_picks_available_bucket() { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let cluster = build_cluster(&table_path, 1, 2); + let assigner = StickyBucketAssigner::new(table_path); + let bucket = assigner.assign_bucket(None, 
&cluster).expect("bucket"); + assert!((0..2).contains(&bucket)); + + assigner.on_new_batch(&cluster, bucket); + let next_bucket = assigner.assign_bucket(None, &cluster).expect("bucket"); + assert!((0..2).contains(&next_bucket)); + } + + #[test] + fn hash_bucket_assigner_requires_key() { + let assigner = HashBucketAssigner::new(3, ::of(None)); + let cluster = Cluster::default(); + let err = assigner.assign_bucket(None, &cluster).unwrap_err(); + assert!(matches!(err, crate::error::Error::IllegalArgument { .. })); + } + + #[test] + fn hash_bucket_assigner_hashes_key() { + let assigner = HashBucketAssigner::new(4, ::of(None)); + let cluster = Cluster::default(); + let bucket = assigner + .assign_bucket(Some(b"key"), &cluster) + .expect("bucket"); + assert!((0..4).contains(&bucket)); + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/mod.rs b/fluss-rust/crates/fluss/src/client/write/mod.rs index cd33586c89..d79418bfc2 100644 --- a/fluss-rust/crates/fluss/src/client/write/mod.rs +++ b/fluss-rust/crates/fluss/src/client/write/mod.rs @@ -18,7 +18,7 @@ mod accumulator; mod batch; -use crate::client::broadcast::{BatchWriteResult, BroadcastOnceReceiver}; +use crate::client::broadcast::{self as client_broadcast, BatchWriteResult, BroadcastOnceReceiver}; use crate::error::Error; use crate::metadata::TablePath; use crate::row::GenericRow; @@ -81,10 +81,18 @@ impl ResultHandle { } pub fn result(&self, batch_result: BatchWriteResult) -> Result<(), Error> { - // do nothing, just return empty result - batch_result.map_err(|e| Error::UnexpectedError { - message: format!("Fail to get write result {e:?}"), - source: None, + batch_result.map_err(|e| match e { + client_broadcast::Error::WriteFailed { code, message } => Error::FlussAPIError { + api_error: crate::rpc::ApiError { code, message }, + }, + client_broadcast::Error::Client { message } => Error::UnexpectedError { + message, + source: None, + }, + client_broadcast::Error::Dropped => Error::UnexpectedError { + message: 
"Fail to get write result because broadcast was dropped.".to_string(), + source: None, + }, }) } } diff --git a/fluss-rust/crates/fluss/src/client/write/sender.rs b/fluss-rust/crates/fluss/src/client/write/sender.rs index 462a846d6c..cb03a2c462 100644 --- a/fluss-rust/crates/fluss/src/client/write/sender.rs +++ b/fluss-rust/crates/fluss/src/client/write/sender.rs @@ -15,15 +15,16 @@ // specific language governing permissions and limitations // under the License. +use crate::client::broadcast; use crate::client::metadata::Metadata; use crate::client::{ReadyWriteBatch, RecordAccumulator}; -use crate::error::Error; -use crate::error::Result; -use crate::metadata::TableBucket; +use crate::error::{FlussError, Result}; +use crate::metadata::{TableBucket, TablePath}; use crate::proto::ProduceLogResponse; use crate::rpc::message::ProduceLogRequest; +use log::warn; use parking_lot::Mutex; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use std::time::Duration; @@ -32,7 +33,7 @@ pub struct Sender { running: bool, metadata: Arc, accumulator: Arc, - in_flight_batches: Mutex>>>, + in_flight_batches: Mutex>>, max_request_size: i32, ack: i16, max_request_timeout_ms: i32, @@ -99,30 +100,30 @@ impl Sender { if !batches.is_empty() { self.add_to_inflight_batches(&batches); - self.send_write_requests(&batches).await?; + self.send_write_requests(batches).await?; } Ok(()) } - fn add_to_inflight_batches(&self, batches: &HashMap>>) { + fn add_to_inflight_batches(&self, batches: &HashMap>) { let mut in_flight = self.in_flight_batches.lock(); for batch_list in batches.values() { for batch in batch_list { in_flight .entry(batch.table_bucket.clone()) .or_default() - .push(batch.clone()); + .push(batch.write_batch.batch_id()); } } } async fn send_write_requests( &self, - collated: &HashMap>>, + collated: HashMap>, ) -> Result<()> { for (leader_id, batches) in collated { - self.send_write_request(*leader_id, self.ack, batches) + 
self.send_write_request(leader_id, self.ack, batches) .await?; } Ok(()) @@ -132,78 +133,434 @@ impl Sender { &self, destination: i32, acks: i16, - batches: &Vec>, + batches: Vec, ) -> Result<()> { if batches.is_empty() { return Ok(()); } let mut records_by_bucket = HashMap::new(); - let mut write_batch_by_table = HashMap::new(); + let mut write_batch_by_table: HashMap> = HashMap::new(); for batch in batches { - records_by_bucket.insert(batch.table_bucket.clone(), batch.clone()); + let table_bucket = batch.table_bucket.clone(); write_batch_by_table - .entry(batch.table_bucket.table_id()) - .or_insert_with(Vec::new) - .push(batch); + .entry(table_bucket.table_id()) + .or_default() + .push(table_bucket.clone()); + records_by_bucket.insert(table_bucket, batch); } let cluster = self.metadata.get_cluster(); - let destination_node = - cluster - .get_tablet_server(destination) - .ok_or(Error::LeaderNotAvailable { - message: format!("destination node not found in metadata cache {destination}."), - })?; - let connection = self.metadata.get_connection(destination_node).await?; + let destination_node = match cluster.get_tablet_server(destination) { + Some(node) => node, + None => { + self.handle_batches_with_error( + records_by_bucket.into_values().collect(), + FlussError::LeaderNotAvailableException, + format!("Destination node not found in metadata cache {destination}."), + ) + .await?; + return Ok(()); + } + }; + let connection = match self.metadata.get_connection(destination_node).await { + Ok(connection) => connection, + Err(e) => { + self.handle_batches_with_error( + records_by_bucket.into_values().collect(), + FlussError::NetworkException, + format!("Failed to connect destination node {destination}: {e}"), + ) + .await?; + return Ok(()); + } + }; - for (table_id, write_batches) in write_batch_by_table { - let request = - ProduceLogRequest::new(table_id, acks, self.max_request_timeout_ms, write_batches)?; - let response = connection.request(request).await?; - 
self.handle_produce_response(table_id, &records_by_bucket, response)? + for (table_id, table_buckets) in write_batch_by_table { + let request_batches: Vec<&ReadyWriteBatch> = table_buckets + .iter() + .filter_map(|bucket| records_by_bucket.get(bucket)) + .collect(); + if request_batches.is_empty() { + continue; + } + let request = match ProduceLogRequest::new( + table_id, + acks, + self.max_request_timeout_ms, + request_batches.as_slice(), + ) { + Ok(request) => request, + Err(e) => { + self.handle_batches_with_local_error( + table_buckets + .iter() + .filter_map(|bucket| records_by_bucket.remove(bucket)) + .collect(), + format!("Failed to build produce request: {e}"), + ) + .await?; + continue; + } + }; + + let response = match connection.request(request).await { + Ok(response) => response, + Err(e) => { + self.handle_batches_with_error( + table_buckets + .iter() + .filter_map(|bucket| records_by_bucket.remove(bucket)) + .collect(), + FlussError::NetworkException, + format!("Failed to send produce request: {e}"), + ) + .await?; + continue; + } + }; + + self.handle_produce_response( + table_id, + &table_buckets, + &mut records_by_bucket, + response, + ) + .await?; } Ok(()) } - fn handle_produce_response( + async fn handle_produce_response( &self, table_id: i64, - records_by_bucket: &HashMap>, + request_buckets: &[TableBucket], + records_by_bucket: &mut HashMap, response: ProduceLogResponse, ) -> Result<()> { + let mut invalid_metadata_tables: HashSet = HashSet::new(); + let mut pending_buckets: HashSet = request_buckets.iter().cloned().collect(); for produce_log_response_for_bucket in response.buckets_resp.iter() { let tb = TableBucket::new(table_id, produce_log_response_for_bucket.bucket_id); - let ready_batch = records_by_bucket.get(&tb).unwrap(); + let Some(ready_batch) = records_by_bucket.remove(&tb) else { + panic!("Missing ready batch for table bucket {tb}"); + }; + pending_buckets.remove(&tb); + if let Some(error_code) = 
produce_log_response_for_bucket.error_code { - todo!("handle_produce_response error: {}", error_code) + if error_code == FlussError::None.code() { + self.complete_batch(ready_batch); + continue; + } + + let error = FlussError::for_code(error_code); + let message = produce_log_response_for_bucket + .error_message + .clone() + .unwrap_or_else(|| error.message().to_string()); + if let Some(table_path) = self + .handle_write_batch_error(ready_batch, error, message) + .await? + { + invalid_metadata_tables.insert(table_path); + } } else { self.complete_batch(ready_batch) } } + if !pending_buckets.is_empty() { + for bucket in pending_buckets { + if let Some(ready_batch) = records_by_bucket.remove(&bucket) { + let message = + format!("Missing response for table bucket {bucket} in produce response."); + let error = FlussError::UnknownServerError; + if let Some(table_path) = self + .handle_write_batch_error(ready_batch, error, message) + .await? + { + invalid_metadata_tables.insert(table_path); + } + } + } + } + self.update_metadata_if_needed(invalid_metadata_tables) + .await; Ok(()) } - fn complete_batch(&self, ready_write_batch: &Arc) { - if ready_write_batch.write_batch.complete(Ok(())) { - // remove from in flight batches - let mut in_flight_guard = self.in_flight_batches.lock(); - if let Some(in_flight) = in_flight_guard.get_mut(&ready_write_batch.table_bucket) { - in_flight.retain(|b| !Arc::ptr_eq(b, ready_write_batch)); - if in_flight.is_empty() { - in_flight_guard.remove(&ready_write_batch.table_bucket); - } - } + fn complete_batch(&self, ready_write_batch: ReadyWriteBatch) { + self.finish_batch(ready_write_batch, Ok(())); + } + + fn fail_batch(&self, ready_write_batch: ReadyWriteBatch, error: broadcast::Error) { + self.finish_batch(ready_write_batch, Err(error)); + } + + fn finish_batch(&self, ready_write_batch: ReadyWriteBatch, result: broadcast::Result<()>) { + if ready_write_batch.write_batch.complete(result) { + 
self.remove_from_inflight_batches(&ready_write_batch); // remove from incomplete batches self.accumulator .remove_incomplete_batches(ready_write_batch.write_batch.batch_id()) } } + async fn handle_batches_with_error( + &self, + batches: Vec, + error: FlussError, + message: String, + ) -> Result<()> { + let mut invalid_metadata_tables: HashSet = HashSet::new(); + for batch in batches { + if let Some(table_path) = self + .handle_write_batch_error(batch, error, message.clone()) + .await? + { + invalid_metadata_tables.insert(table_path); + } + } + self.update_metadata_if_needed(invalid_metadata_tables) + .await; + Ok(()) + } + + async fn handle_batches_with_local_error( + &self, + batches: Vec, + message: String, + ) -> Result<()> { + for batch in batches { + self.fail_batch( + batch, + broadcast::Error::Client { + message: message.clone(), + }, + ); + } + Ok(()) + } + + async fn handle_write_batch_error( + &self, + ready_write_batch: ReadyWriteBatch, + error: FlussError, + message: String, + ) -> Result> { + let table_path = ready_write_batch.write_batch.table_path().clone(); + if self.can_retry(&ready_write_batch, error) { + warn!( + "Retrying write batch for {table_path} on bucket {} after error {error:?}: {message}", + ready_write_batch.table_bucket.bucket_id() + ); + self.re_enqueue_batch(ready_write_batch).await; + return Ok(Self::is_invalid_metadata_error(error).then_some(table_path)); + } + + if error == FlussError::DuplicateSequenceException { + warn!( + "Duplicate sequence for {table_path} on bucket {}: {message}", + ready_write_batch.table_bucket.bucket_id() + ); + self.complete_batch(ready_write_batch); + return Ok(None); + } + + self.fail_batch( + ready_write_batch, + broadcast::Error::WriteFailed { + code: error.code(), + message, + }, + ); + Ok(Self::is_invalid_metadata_error(error).then_some(table_path)) + } + + async fn re_enqueue_batch(&self, ready_write_batch: ReadyWriteBatch) { + self.remove_from_inflight_batches(&ready_write_batch); + 
self.accumulator.re_enqueue(ready_write_batch).await; + } + + fn remove_from_inflight_batches(&self, ready_write_batch: &ReadyWriteBatch) { + let batch_id = ready_write_batch.write_batch.batch_id(); + let mut in_flight_guard = self.in_flight_batches.lock(); + if let Some(in_flight) = in_flight_guard.get_mut(&ready_write_batch.table_bucket) { + in_flight.retain(|id| *id != batch_id); + if in_flight.is_empty() { + in_flight_guard.remove(&ready_write_batch.table_bucket); + } + } + } + + fn can_retry(&self, ready_write_batch: &ReadyWriteBatch, error: FlussError) -> bool { + ready_write_batch.write_batch.attempts() < self.retries + && !ready_write_batch.write_batch.is_done() + && Self::is_retriable_error(error) + } + + async fn update_metadata_if_needed(&self, table_paths: HashSet) { + if table_paths.is_empty() { + return; + } + let table_path_refs: HashSet<&TablePath> = table_paths.iter().collect(); + if let Err(e) = self.metadata.update_tables_metadata(&table_path_refs).await { + warn!("Failed to update metadata after write error: {e:?}"); + } + } + + fn is_invalid_metadata_error(error: FlussError) -> bool { + matches!( + error, + FlussError::NotLeaderOrFollower + | FlussError::UnknownTableOrBucketException + | FlussError::LeaderNotAvailableException + | FlussError::NetworkException + ) + } + + fn is_retriable_error(error: FlussError) -> bool { + matches!( + error, + FlussError::NetworkException + | FlussError::NotLeaderOrFollower + | FlussError::UnknownTableOrBucketException + | FlussError::LeaderNotAvailableException + | FlussError::LogStorageException + | FlussError::KvStorageException + | FlussError::StorageException + | FlussError::RequestTimeOut + | FlussError::NotEnoughReplicasAfterAppendException + | FlussError::NotEnoughReplicasException + | FlussError::CorruptMessage + | FlussError::CorruptRecordException + ) + } + pub async fn close(&mut self) { self.running = false; } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::client::WriteRecord; + use 
crate::cluster::Cluster; + use crate::config::Config; + use crate::metadata::TablePath; + use crate::proto::{PbProduceLogRespForBucket, ProduceLogResponse}; + use crate::row::{Datum, GenericRow}; + use crate::rpc::FlussError; + use crate::test_utils::build_cluster_arc; + use std::collections::HashSet; + + async fn build_ready_batch( + accumulator: &RecordAccumulator, + cluster: Arc, + table_path: Arc, + ) -> Result<(ReadyWriteBatch, crate::client::ResultHandle)> { + let record = WriteRecord::new( + table_path, + GenericRow { + values: vec![Datum::Int32(1)], + }, + ); + let result = accumulator.append(&record, 0, &cluster, false).await?; + let result_handle = result.result_handle.expect("result handle"); + let server = cluster.get_tablet_server(1).expect("server"); + let nodes = HashSet::from([server.clone()]); + let mut batches = accumulator.drain(cluster, &nodes, 1024 * 1024).await?; + let mut drained = batches.remove(&1).expect("drained batches"); + let batch = drained.pop().expect("batch"); + Ok((batch, result_handle)) + } + + #[tokio::test] + async fn handle_write_batch_error_retries() -> Result<()> { + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + let cluster = build_cluster_arc(table_path.as_ref(), 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster.clone())); + let accumulator = Arc::new(RecordAccumulator::new(Config::default())); + let sender = Sender::new(metadata, accumulator.clone(), 1024 * 1024, 1000, 1, 1); + + let (batch, _handle) = + build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path.clone()).await?; + let mut inflight = HashMap::new(); + inflight.insert(1, vec![batch]); + sender.add_to_inflight_batches(&inflight); + let batch = inflight.remove(&1).unwrap().pop().unwrap(); + + sender + .handle_write_batch_error(batch, FlussError::RequestTimeOut, "timeout".to_string()) + .await?; + + let server = cluster.get_tablet_server(1).expect("server"); + let nodes = 
HashSet::from([server.clone()]); + let mut batches = accumulator.drain(cluster, &nodes, 1024 * 1024).await?; + let mut drained = batches.remove(&1).expect("drained batches"); + let batch = drained.pop().expect("batch"); + assert_eq!(batch.write_batch.attempts(), 1); + Ok(()) + } + + #[tokio::test] + async fn handle_write_batch_error_fails() -> Result<()> { + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + let cluster = build_cluster_arc(table_path.as_ref(), 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster.clone())); + let accumulator = Arc::new(RecordAccumulator::new(Config::default())); + let sender = Sender::new(metadata, accumulator.clone(), 1024 * 1024, 1000, 1, 0); + + let (batch, handle) = + build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path).await?; + sender + .handle_write_batch_error( + batch, + FlussError::InvalidTableException, + "invalid".to_string(), + ) + .await?; + + let batch_result = handle.wait().await?; + assert!(matches!( + batch_result, + Err(broadcast::Error::WriteFailed { code, .. 
}) + if code == FlussError::InvalidTableException.code() + )); + Ok(()) + } + + #[tokio::test] + async fn handle_produce_response_duplicate_sequence_completes() -> Result<()> { + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + let cluster = build_cluster_arc(table_path.as_ref(), 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster.clone())); + let accumulator = Arc::new(RecordAccumulator::new(Config::default())); + let sender = Sender::new(metadata, accumulator.clone(), 1024 * 1024, 1000, 1, 0); + + let (batch, handle) = build_ready_batch(accumulator.as_ref(), cluster, table_path).await?; + let request_buckets = vec![batch.table_bucket.clone()]; + let mut records_by_bucket = HashMap::new(); + records_by_bucket.insert(batch.table_bucket.clone(), batch); + + let response = ProduceLogResponse { + buckets_resp: vec![PbProduceLogRespForBucket { + bucket_id: 0, + error_code: Some(FlussError::DuplicateSequenceException.code()), + error_message: Some("dup".to_string()), + ..Default::default() + }], + }; + + sender + .handle_produce_response(1, &request_buckets, &mut records_by_bucket, response) + .await?; + + let batch_result = handle.wait().await?; + assert!(matches!(batch_result, Ok(()))); + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/lib.rs b/fluss-rust/crates/fluss/src/lib.rs index 1bd72a4aac..e8d822fb77 100644 --- a/fluss-rust/crates/fluss/src/lib.rs +++ b/fluss-rust/crates/fluss/src/lib.rs @@ -31,6 +31,9 @@ mod compression; pub mod io; mod util; +#[cfg(test)] +mod test_utils; + pub type TableId = u64; pub type PartitionId = u64; pub type BucketId = i32; diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs index 78872a9dd4..c054e08aeb 100644 --- a/fluss-rust/crates/fluss/src/row/datum.rs +++ b/fluss-rust/crates/fluss/src/row/datum.rs @@ -448,3 +448,69 @@ impl Date { date.day() } } + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{Array, Int32Builder, 
StringBuilder}; + + #[test] + fn datum_accessors_and_conversions() { + let datum = Datum::String("value"); + assert_eq!(datum.as_str(), "value"); + assert!(!datum.is_null()); + + let blob = Blob::from(vec![1, 2, 3]); + let datum = Datum::Blob(blob); + assert_eq!(datum.as_blob(), &[1, 2, 3]); + + assert!(Datum::Null.is_null()); + + let datum = Datum::Int32(42); + let value: i32 = (&datum).try_into().unwrap(); + assert_eq!(value, 42); + let value: std::result::Result = (&datum).try_into(); + assert!(value.is_err()); + } + + #[test] + fn datum_append_to_builder() { + let mut builder = Int32Builder::new(); + Datum::Null.append_to(&mut builder).unwrap(); + Datum::Int32(5).append_to(&mut builder).unwrap(); + let array = builder.finish(); + assert!(array.is_null(0)); + assert_eq!(array.value(1), 5); + + let mut builder = StringBuilder::new(); + let err = Datum::Int32(1).append_to(&mut builder).unwrap_err(); + assert!(matches!(err, crate::error::Error::RowConvertError { .. })); + + let mut builder = Int32Builder::new(); + let err = Datum::Date(Date::new(0)) + .append_to(&mut builder) + .unwrap_err(); + assert!(matches!(err, crate::error::Error::RowConvertError { .. 
})); + } + + #[test] + #[should_panic] + fn datum_as_str_panics_on_non_string() { + let _ = Datum::Int32(1).as_str(); + } + + #[test] + #[should_panic] + fn datum_as_blob_panics_on_non_blob() { + let _ = Datum::Int16(1).as_blob(); + } + + #[test] + fn date_components() { + let date = Date::new(0); + assert_eq!(date.get_inner(), 0); + assert_eq!(date.year(), 1970); + assert_eq!(date.month(), 1); + assert_eq!(date.day(), 1); + } +} diff --git a/fluss-rust/crates/fluss/src/rpc/api_key.rs b/fluss-rust/crates/fluss/src/rpc/api_key.rs index b11647f960..c51539642b 100644 --- a/fluss-rust/crates/fluss/src/rpc/api_key.rs +++ b/fluss-rust/crates/fluss/src/rpc/api_key.rs @@ -85,3 +85,41 @@ impl From for i16 { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn api_key_round_trip() { + let cases = [ + (1001, ApiKey::CreateDatabase), + (1002, ApiKey::DropDatabase), + (1003, ApiKey::ListDatabases), + (1004, ApiKey::DatabaseExists), + (1005, ApiKey::CreateTable), + (1006, ApiKey::DropTable), + (1007, ApiKey::GetTable), + (1008, ApiKey::ListTables), + (1010, ApiKey::TableExists), + (1012, ApiKey::MetaData), + (1014, ApiKey::ProduceLog), + (1015, ApiKey::FetchLog), + (1021, ApiKey::ListOffsets), + (1025, ApiKey::GetFileSystemSecurityToken), + (1032, ApiKey::GetLatestLakeSnapshot), + (1035, ApiKey::GetDatabaseInfo), + ]; + + for (raw, key) in cases { + assert_eq!(ApiKey::from(raw), key); + let mapped: i16 = key.into(); + assert_eq!(mapped, raw); + } + + let unknown = ApiKey::from(9999); + assert_eq!(unknown, ApiKey::Unknown(9999)); + let mapped: i16 = unknown.into(); + assert_eq!(mapped, 9999); + } +} diff --git a/fluss-rust/crates/fluss/src/rpc/api_version.rs b/fluss-rust/crates/fluss/src/rpc/api_version.rs index 395c45cd0f..f009d6914f 100644 --- a/fluss-rust/crates/fluss/src/rpc/api_version.rs +++ b/fluss-rust/crates/fluss/src/rpc/api_version.rs @@ -52,3 +52,28 @@ impl std::fmt::Display for ApiVersionRange { write!(f, "{}:{}", self.min, self.max) } } + 
+#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn api_version_display() { + let version = ApiVersion(3); + assert_eq!(version.to_string(), "3"); + } + + #[test] + fn api_version_range_accessors() { + let range = ApiVersionRange::new(ApiVersion(1), ApiVersion(4)); + assert_eq!(range.min(), ApiVersion(1)); + assert_eq!(range.max(), ApiVersion(4)); + assert_eq!(range.to_string(), "1:4"); + } + + #[test] + #[should_panic] + fn api_version_range_panics_on_invalid_bounds() { + let _ = ApiVersionRange::new(ApiVersion(4), ApiVersion(1)); + } +} diff --git a/fluss-rust/crates/fluss/src/rpc/convert.rs b/fluss-rust/crates/fluss/src/rpc/convert.rs index 6feb7eb8af..1862589bc5 100644 --- a/fluss-rust/crates/fluss/src/rpc/convert.rs +++ b/fluss-rust/crates/fluss/src/rpc/convert.rs @@ -41,3 +41,51 @@ pub fn from_pb_table_path(pb_table_path: &PbTablePath) -> TablePath { pb_table_path.table_name.to_string(), ) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::proto::{PbServerNode, PbTablePath}; + + #[test] + fn table_path_round_trip() { + let table_path = TablePath::new("db".to_string(), "table".to_string()); + let pb = to_table_path(&table_path); + assert_eq!(pb.database_name, "db"); + assert_eq!(pb.table_name, "table"); + + let restored = from_pb_table_path(&pb); + assert_eq!(restored, table_path); + + let manual = PbTablePath { + database_name: "db2".to_string(), + table_name: "table2".to_string(), + }; + let restored = from_pb_table_path(&manual); + assert_eq!(restored.database(), "db2"); + assert_eq!(restored.table(), "table2"); + } + + #[test] + fn server_node_from_pb() { + let pb = PbServerNode { + node_id: 7, + host: "127.0.0.1".to_string(), + port: 9092, + listeners: None, + }; + let node = from_pb_server_node(pb, ServerType::TabletServer); + assert_eq!(node.id(), 7); + assert_eq!(node.url(), "127.0.0.1:9092"); + assert_eq!(node.uid(), "ts-7"); + + let pb = PbServerNode { + node_id: 3, + host: "localhost".to_string(), + port: 8123, + listeners: None, + 
}; + let node = from_pb_server_node(pb, ServerType::CoordinatorServer); + assert_eq!(node.uid(), "cs-3"); + } +} diff --git a/fluss-rust/crates/fluss/src/rpc/fluss_api_error.rs b/fluss-rust/crates/fluss/src/rpc/fluss_api_error.rs index b26eb72f61..a501b9974e 100644 --- a/fluss-rust/crates/fluss/src/rpc/fluss_api_error.rs +++ b/fluss-rust/crates/fluss/src/rpc/fluss_api_error.rs @@ -369,3 +369,38 @@ impl From for FlussError { FlussError::for_code(api_error.code) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn for_code_maps_known_and_unknown() { + assert_eq!(FlussError::for_code(0), FlussError::None); + assert_eq!( + FlussError::for_code(FlussError::AuthorizationException.code()), + FlussError::AuthorizationException + ); + assert_eq!(FlussError::for_code(9999), FlussError::UnknownServerError); + } + + #[test] + fn to_api_error_uses_message() { + let err = FlussError::InvalidTableException.to_api_error(None); + assert_eq!(err.code, FlussError::InvalidTableException.code()); + assert!(err.message.contains("invalid table")); + } + + #[test] + fn error_response_conversion_round_trip() { + let response = ErrorResponse { + error_code: FlussError::TableNotExist.code(), + error_message: Some("missing".to_string()), + }; + let api_error = ApiError::from(response); + assert_eq!(api_error.code, FlussError::TableNotExist.code()); + assert_eq!(api_error.message, "missing"); + let fluss_error = FlussError::from(api_error); + assert_eq!(fluss_error, FlussError::TableNotExist); + } +} diff --git a/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs b/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs index 39bfb3f205..eb725751c7 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs @@ -19,14 +19,12 @@ use crate::error::Result as FlussResult; use crate::proto::{PbProduceLogReqForBucket, ProduceLogResponse}; use crate::rpc::frame::ReadError; +use crate::client::ReadyWriteBatch; use 
crate::rpc::api_key::ApiKey; use crate::rpc::api_version::ApiVersion; use crate::rpc::frame::WriteError; use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; use crate::{impl_read_version_type, impl_write_version_type, proto}; -use std::sync::Arc; - -use crate::client::ReadyWriteBatch; use bytes::{Buf, BufMut}; use prost::Message; @@ -39,7 +37,7 @@ impl ProduceLogRequest { table_id: i64, ack: i16, max_request_timeout_ms: i32, - ready_batches: Vec<&Arc>, + ready_batches: &[&ReadyWriteBatch], ) -> FlussResult { let mut request = proto::ProduceLogRequest { table_id, diff --git a/fluss-rust/crates/fluss/src/test_utils.rs b/fluss-rust/crates/fluss/src/test_utils.rs new file mode 100644 index 0000000000..d1cd3ec712 --- /dev/null +++ b/fluss-rust/crates/fluss/src/test_utils.rs @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::cluster::{BucketLocation, Cluster, ServerNode, ServerType}; +use crate::metadata::{ + DataField, DataTypes, Schema, TableBucket, TableDescriptor, TableInfo, TablePath, +}; +use std::collections::HashMap; +use std::sync::Arc; + +pub(crate) fn build_table_info(table_path: TablePath, table_id: i64, buckets: i32) -> TableInfo { + let row_type = DataTypes::row(vec![DataField::new( + "id".to_string(), + DataTypes::int(), + None, + )]); + let mut schema_builder = Schema::builder().with_row_type(&row_type); + let schema = schema_builder.build().expect("schema build"); + let table_descriptor = TableDescriptor::builder() + .schema(schema) + .distributed_by(Some(buckets), vec![]) + .build() + .expect("descriptor build"); + TableInfo::of(table_path, table_id, 1, table_descriptor, 0, 0) +} + +pub(crate) fn build_cluster(table_path: &TablePath, table_id: i64, buckets: i32) -> Cluster { + let server = ServerNode::new(1, "127.0.0.1".to_string(), 9092, ServerType::TabletServer); + + let mut servers = HashMap::new(); + servers.insert(server.id(), server.clone()); + + let mut locations_by_path = HashMap::new(); + let mut locations_by_bucket = HashMap::new(); + let mut bucket_locations = Vec::new(); + + for bucket_id in 0..buckets { + let table_bucket = TableBucket::new(table_id, bucket_id); + let bucket_location = BucketLocation::new( + table_bucket.clone(), + Some(server.clone()), + table_path.clone(), + ); + bucket_locations.push(bucket_location.clone()); + locations_by_bucket.insert(table_bucket, bucket_location); + } + locations_by_path.insert(table_path.clone(), bucket_locations); + + let mut table_id_by_path = HashMap::new(); + table_id_by_path.insert(table_path.clone(), table_id); + + let mut table_info_by_path = HashMap::new(); + table_info_by_path.insert( + table_path.clone(), + build_table_info(table_path.clone(), table_id, buckets), + ); + + Cluster::new( + None, + servers, + locations_by_path, + locations_by_bucket, + table_id_by_path, + table_info_by_path, 
+ ) +} + +pub(crate) fn build_cluster_arc( + table_path: &TablePath, + table_id: i64, + buckets: i32, +) -> Arc { + Arc::new(build_cluster(table_path, table_id, buckets)) +} From 69b3ccc3d0d8d1bf5cd8c3fc6aac2f2e6bbf8349 Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Sun, 11 Jan 2026 03:16:49 +0000 Subject: [PATCH 059/287] chore: improve datum, use Cow to reduce verbosity and avoid Box::leak gymnastics (#139) --- fluss-rust/bindings/cpp/src/types.rs | 6 +- .../fluss/src/row/binary/binary_writer.rs | 9 +- .../src/row/compacted/compacted_row_reader.rs | 7 +- fluss-rust/crates/fluss/src/row/datum.rs | 95 ++++--------------- .../crates/fluss/src/row/field_getter.rs | 2 +- 5 files changed, 32 insertions(+), 87 deletions(-) diff --git a/fluss-rust/bindings/cpp/src/types.rs b/fluss-rust/bindings/cpp/src/types.rs index d95da14212..fef73ceaf0 100644 --- a/fluss-rust/bindings/cpp/src/types.rs +++ b/fluss-rust/bindings/cpp/src/types.rs @@ -25,6 +25,7 @@ use arrow::array::{ use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; use fcore::row::InternalRow; use fluss as fcore; +use std::borrow::Cow; pub const DATA_TYPE_BOOLEAN: i32 = 1; pub const DATA_TYPE_TINYINT: i32 = 2; @@ -218,9 +219,8 @@ pub fn ffi_row_to_core(row: &ffi::FfiGenericRow) -> fcore::row::GenericRow<'_> { DATUM_TYPE_INT64 => Datum::Int64(field.i64_val), DATUM_TYPE_FLOAT32 => Datum::Float32(field.f32_val.into()), DATUM_TYPE_FLOAT64 => Datum::Float64(field.f64_val.into()), - DATUM_TYPE_STRING => Datum::String(field.string_val.as_str()), - // todo: avoid copy bytes for blob - DATUM_TYPE_BYTES => Datum::Blob(field.bytes_val.clone().into()), + DATUM_TYPE_STRING => Datum::String(Cow::Borrowed(field.string_val.as_str())), + DATUM_TYPE_BYTES => Datum::Blob(Cow::Borrowed(field.bytes_val.as_slice())), _ => Datum::Null, }; generic_row.set_field(idx, datum); diff --git a/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs b/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs index 44f10b6309..9917c7b76a 
100644 --- a/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs +++ b/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs @@ -170,17 +170,12 @@ impl InnerValueWriter { writer.write_boolean(*v); } (InnerValueWriter::Binary, Datum::Blob(v)) => { - writer.write_binary(v.as_ref(), v.len()); - } - (InnerValueWriter::Binary, Datum::BorrowedBlob(v)) => { - writer.write_binary(v.as_ref(), v.len()); + let b = v.as_ref(); + writer.write_binary(b, b.len()); } (InnerValueWriter::Bytes, Datum::Blob(v)) => { writer.write_bytes(v.as_ref()); } - (InnerValueWriter::Bytes, Datum::BorrowedBlob(v)) => { - writer.write_bytes(v.as_ref()); - } (InnerValueWriter::TinyInt, Datum::Int8(v)) => { writer.write_byte(*v as u8); } diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs index 19afe887d4..00d94ad675 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs @@ -16,6 +16,7 @@ // under the License. 
use bytes::Bytes; +use std::borrow::Cow; use crate::{ metadata::DataType, @@ -52,10 +53,12 @@ impl CompactedRowDeserializer { DataType::Float(_) => Datum::Float32(reader.read_float().into()), DataType::Double(_) => Datum::Float64(reader.read_double().into()), // TODO: use read_char(length) in the future, but need to keep compatibility - DataType::Char(_) | DataType::String(_) => Datum::OwnedString(reader.read_string()), + DataType::Char(_) | DataType::String(_) => { + Datum::String(Cow::Owned(reader.read_string())) + } // TODO: use read_binary(length) in the future, but need to keep compatibility DataType::Bytes(_) | DataType::Binary(_) => { - Datum::Blob(reader.read_bytes().into_vec().into()) + Datum::Blob(Cow::Owned(reader.read_bytes().into_vec())) } _ => panic!("unsupported DataType in CompactedRowDeserializer"), }; diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs index c054e08aeb..fa85ded4b8 100644 --- a/fluss-rust/crates/fluss/src/row/datum.rs +++ b/fluss-rust/crates/fluss/src/row/datum.rs @@ -24,11 +24,9 @@ use arrow::array::{ use jiff::ToSpan; use ordered_float::OrderedFloat; use parse_display::Display; -use ref_cast::RefCast; use rust_decimal::Decimal; -use serde::{Deserialize, Serialize}; -use std::fmt; -use std::ops::Deref; +use serde::Serialize; +use std::borrow::Cow; #[allow(dead_code)] const THIRTY_YEARS_MICROSECONDS: i64 = 946_684_800_000_000; @@ -52,14 +50,9 @@ pub enum Datum<'a> { #[display("{0}")] Float64(F64), #[display("'{0}'")] - String(&'a str), - /// Owned string - #[display("'{0}'")] - OwnedString(String), - #[display("{0}")] - Blob(Blob), + String(Str<'a>), #[display("{:?}")] - BorrowedBlob(&'a [u8]), + Blob(Blob<'a>), #[display("{0}")] Decimal(Decimal), #[display("{0}")] @@ -78,7 +71,6 @@ impl Datum<'_> { pub fn as_str(&self) -> &str { match self { Self::String(s) => s, - Self::OwnedString(s) => s.as_str(), _ => panic!("not a string: {self:?}"), } } @@ -86,7 +78,6 @@ impl Datum<'_> { pub fn 
as_blob(&self) -> &[u8] { match self { Self::Blob(blob) => blob.as_ref(), - Self::BorrowedBlob(blob) => blob, _ => panic!("not a blob: {self:?}"), } } @@ -121,10 +112,19 @@ impl<'a> From for Datum<'a> { } } +pub type Str<'a> = Cow<'a, str>; + +impl<'a> From for Datum<'a> { + #[inline] + fn from(s: String) -> Self { + Datum::String(Cow::Owned(s)) + } +} + impl<'a> From<&'a str> for Datum<'a> { #[inline] fn from(s: &'a str) -> Datum<'a> { - Datum::String(s) + Datum::String(Cow::Borrowed(s)) } } @@ -226,8 +226,7 @@ impl<'b, 'a: 'b> TryFrom<&'b Datum<'a>> for &'b str { #[inline] fn try_from(from: &'b Datum<'a>) -> std::result::Result { match from { - Datum::String(i) => Ok(*i), - Datum::OwnedString(s) => Ok(s.as_str()), + Datum::String(s) => Ok(s.as_ref()), _ => Err(()), } } @@ -295,10 +294,8 @@ impl Datum<'_> { Datum::Int64(v) => append_value_to_arrow!(Int64Builder, *v), Datum::Float32(v) => append_value_to_arrow!(Float32Builder, v.into_inner()), Datum::Float64(v) => append_value_to_arrow!(Float64Builder, v.into_inner()), - Datum::String(v) => append_value_to_arrow!(StringBuilder, *v), - Datum::OwnedString(v) => append_value_to_arrow!(StringBuilder, v.as_str()), + Datum::String(v) => append_value_to_arrow!(StringBuilder, v.as_ref()), Datum::Blob(v) => append_value_to_arrow!(BinaryBuilder, v.as_ref()), - Datum::BorrowedBlob(v) => append_value_to_arrow!(BinaryBuilder, *v), Datum::Decimal(_) | Datum::Date(_) | Datum::Timestamp(_) | Datum::TimestampTz(_) => { return Err(RowConvertError { message: format!( @@ -349,58 +346,6 @@ impl_to_arrow!(&str, StringBuilder); pub type F32 = OrderedFloat; pub type F64 = OrderedFloat; -#[allow(dead_code)] -pub type Str = Box; - -#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Serialize, Deserialize, Default)] -pub struct Blob(Box<[u8]>); - -impl Deref for Blob { - type Target = BlobRef; - - fn deref(&self) -> &Self::Target { - BlobRef::new(&self.0) - } -} - -impl BlobRef { - pub fn new(bytes: &[u8]) -> &Self { - // SAFETY: 
`&BlobRef` and `&[u8]` have the same layout. - BlobRef::ref_cast(bytes) - } -} - -/// A slice of a blob. -#[repr(transparent)] -#[derive(PartialEq, Eq, PartialOrd, Ord, RefCast, Hash)] -pub struct BlobRef([u8]); - -impl fmt::Debug for Blob { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{:?}", self.as_ref()) - } -} - -impl fmt::Display for Blob { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{:?}", self.as_ref()) - } -} - -impl AsRef<[u8]> for BlobRef { - fn as_ref(&self) -> &[u8] { - &self.0 - } -} - -impl Deref for BlobRef { - type Target = [u8]; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - #[derive(PartialOrd, Ord, Display, PartialEq, Eq, Debug, Copy, Clone, Default, Hash, Serialize)] pub struct Date(i32); @@ -410,15 +355,17 @@ pub struct Timestamp(i64); #[derive(PartialOrd, Ord, Display, PartialEq, Eq, Debug, Copy, Clone, Default, Hash, Serialize)] pub struct TimestampLtz(i64); -impl From> for Blob { +pub type Blob<'a> = Cow<'a, [u8]>; + +impl<'a> From> for Datum<'a> { fn from(vec: Vec) -> Self { - Blob(vec.into()) + Datum::Blob(Blob::from(vec)) } } impl<'a> From<&'a [u8]> for Datum<'a> { fn from(bytes: &'a [u8]) -> Datum<'a> { - Datum::BorrowedBlob(bytes) + Datum::Blob(Blob::from(bytes)) } } diff --git a/fluss-rust/crates/fluss/src/row/field_getter.rs b/fluss-rust/crates/fluss/src/row/field_getter.rs index 3a9cf0fa81..8e529e5446 100644 --- a/fluss-rust/crates/fluss/src/row/field_getter.rs +++ b/fluss-rust/crates/fluss/src/row/field_getter.rs @@ -83,7 +83,7 @@ pub enum InnerFieldGetter { impl InnerFieldGetter { pub fn get_field<'a>(&self, row: &'a dyn InternalRow) -> Datum<'a> { match self { - InnerFieldGetter::Char { pos, len } => Datum::String(row.get_char(*pos, *len)), + InnerFieldGetter::Char { pos, len } => Datum::from(row.get_char(*pos, *len)), InnerFieldGetter::String { pos } => Datum::from(row.get_string(*pos)), InnerFieldGetter::Bool { pos } => Datum::from(row.get_boolean(*pos)), 
InnerFieldGetter::Binary { pos, len } => Datum::from(row.get_binary(*pos, *len)), From e9d16ae99d472dc38a292839125e51693fd822cb Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Sun, 11 Jan 2026 21:36:46 +0800 Subject: [PATCH 060/287] chore: implement zero copy in CompactedRowReader (#138) --- .../fluss/src/row/compacted/compacted_row.rs | 137 ++++------ .../src/row/compacted/compacted_row_reader.rs | 239 +++++++++--------- .../src/row/compacted/compacted_row_writer.rs | 4 +- fluss-rust/crates/fluss/src/row/datum.rs | 2 +- 4 files changed, 178 insertions(+), 204 deletions(-) diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs index fca41c655f..481f9be502 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use bytes::Bytes; +use std::sync::OnceLock; use crate::metadata::DataType; use crate::row::compacted::compacted_row_reader::{CompactedRowDeserializer, CompactedRowReader}; @@ -24,125 +24,95 @@ use crate::row::{GenericRow, InternalRow}; // Reference implementation: // https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/row/compacted/CompactedRow.java #[allow(dead_code)] -pub struct CompactedRow { +pub struct CompactedRow<'a> { arity: usize, - segment: Bytes, - offset: usize, size_in_bytes: usize, - decoded: bool, - decoded_row: GenericRow<'static>, - reader: CompactedRowReader, - deserializer: CompactedRowDeserializer, + decoded_row: OnceLock>, + deserializer: CompactedRowDeserializer<'a>, + reader: CompactedRowReader<'a>, + data_types: &'a [DataType], } -#[allow(dead_code)] -impl CompactedRow { - pub fn calculate_bit_set_width_in_bytes(arity: usize) -> usize { - arity.div_ceil(8) - } - - pub fn new(types: Vec) -> Self { - let arity = types.len(); - Self { - arity, - 
segment: Bytes::new(), - offset: 0, - size_in_bytes: 0, - decoded: false, - decoded_row: GenericRow::new(), - reader: CompactedRowReader::new(arity), - deserializer: CompactedRowDeserializer::new(types), - } - } +pub fn calculate_bit_set_width_in_bytes(arity: usize) -> usize { + arity.div_ceil(8) +} - pub fn from_bytes(types: Vec, data: Bytes) -> Self { - let arity = types.len(); +#[allow(dead_code)] +impl<'a> CompactedRow<'a> { + pub fn from_bytes(data_types: &'a [DataType], data: &'a [u8]) -> Self { + let arity = data_types.len(); let size = data.len(); Self { arity, - segment: data, - offset: 0, size_in_bytes: size, - decoded: false, - decoded_row: GenericRow::new(), - reader: CompactedRowReader::new(arity), - deserializer: CompactedRowDeserializer::new(types), + decoded_row: OnceLock::new(), + deserializer: CompactedRowDeserializer::new(data_types), + reader: CompactedRowReader::new(arity, data, 0, size), + data_types, } } - pub fn point_to(&mut self, segment: Bytes, offset: usize, size_in_bytes: usize) { - self.segment = segment; - self.offset = offset; - self.size_in_bytes = size_in_bytes; - self.decoded = false; - } - - pub fn get_segment(&self) -> &Bytes { - &self.segment - } - - pub fn get_offset(&self) -> usize { - self.offset - } - pub fn get_size_in_bytes(&self) -> usize { self.size_in_bytes } - pub fn get_field_count(&self) -> usize { - self.arity + fn decoded_row(&self) -> &GenericRow<'_> { + self.decoded_row + .get_or_init(|| self.deserializer.deserialize(&self.reader)) } +} - pub fn is_null_at(&self, pos: usize) -> bool { - let byte_index = pos >> 3; - let bit = pos & 7; - let idx = self.offset + byte_index; - (self.segment[idx] & (1u8 << bit)) != 0 +#[allow(dead_code)] +impl<'a> InternalRow for CompactedRow<'a> { + fn get_field_count(&self) -> usize { + self.arity } - fn decoded_row(&mut self) -> &GenericRow<'static> { - if !self.decoded { - self.reader - .point_to(self.segment.clone(), self.offset, self.size_in_bytes); - self.decoded_row = 
self.deserializer.deserialize(&mut self.reader); - self.decoded = true; - } - &self.decoded_row + fn is_null_at(&self, pos: usize) -> bool { + self.data_types[pos].is_nullable() && self.reader.is_null_at(pos) } - pub fn get_boolean(&mut self, pos: usize) -> bool { + fn get_boolean(&self, pos: usize) -> bool { self.decoded_row().get_boolean(pos) } - pub fn get_byte(&mut self, pos: usize) -> i8 { + fn get_byte(&self, pos: usize) -> i8 { self.decoded_row().get_byte(pos) } - pub fn get_short(&mut self, pos: usize) -> i16 { + fn get_short(&self, pos: usize) -> i16 { self.decoded_row().get_short(pos) } - pub fn get_int(&mut self, pos: usize) -> i32 { + fn get_int(&self, pos: usize) -> i32 { self.decoded_row().get_int(pos) } - pub fn get_long(&mut self, pos: usize) -> i64 { + fn get_long(&self, pos: usize) -> i64 { self.decoded_row().get_long(pos) } - pub fn get_float(&mut self, pos: usize) -> f32 { + fn get_float(&self, pos: usize) -> f32 { self.decoded_row().get_float(pos) } - pub fn get_double(&mut self, pos: usize) -> f64 { + fn get_double(&self, pos: usize) -> f64 { self.decoded_row().get_double(pos) } - pub fn get_string(&mut self, pos: usize) -> &str { + fn get_char(&self, pos: usize, length: usize) -> &str { + self.decoded_row().get_char(pos, length) + } + + fn get_string(&self, pos: usize) -> &str { self.decoded_row().get_string(pos) } - pub fn get_bytes(&mut self, pos: usize) -> &[u8] { + fn get_binary(&self, pos: usize, length: usize) -> &[u8] { + self.decoded_row().get_binary(pos, length) + } + + fn get_bytes(&self, pos: usize) -> &[u8] { self.decoded_row().get_bytes(pos) } } @@ -171,7 +141,6 @@ mod tests { DataType::Bytes(BytesType::new()), ]; - let mut row = CompactedRow::new(types.clone()); let mut writer = CompactedRowWriter::new(types.len()); writer.write_boolean(true); @@ -184,7 +153,8 @@ mod tests { writer.write_string("Hello World"); writer.write_bytes(&[1, 2, 3, 4, 5]); - row.point_to(writer.to_bytes(), 0, writer.position()); + let bytes = 
writer.to_bytes(); + let mut row = CompactedRow::from_bytes(types.as_slice(), bytes.as_ref()); assert_eq!(row.get_field_count(), 9); assert!(row.get_boolean(0)); @@ -204,14 +174,14 @@ mod tests { DataType::Double(DoubleType::new()), ]; - let mut row = CompactedRow::new(types.clone()); let mut writer = CompactedRowWriter::new(types.len()); writer.write_int(100); writer.set_null_at(1); writer.write_double(2.71); - row.point_to(writer.to_bytes(), 0, writer.position()); + let bytes = writer.to_bytes(); + row = CompactedRow::from_bytes(types.as_slice(), bytes.as_ref()); assert!(!row.is_null_at(0)); assert!(row.is_null_at(1)); @@ -230,12 +200,13 @@ mod tests { ]; let mut writer = CompactedRowWriter::new(types.len()); - writer.write_int(42); + writer.write_int(-1); writer.write_string("test"); - let mut row = CompactedRow::from_bytes(types, writer.to_bytes()); + let bytes = writer.to_bytes(); + let mut row = CompactedRow::from_bytes(types.as_slice(), bytes.as_ref()); - assert_eq!(row.get_int(0), 42); + assert_eq!(row.get_int(0), -1); assert_eq!(row.get_string(1), "test"); // Test large row @@ -244,14 +215,14 @@ mod tests { .map(|_| DataType::Int(IntType::new())) .collect(); - let mut row = CompactedRow::new(types.clone()); let mut writer = CompactedRowWriter::new(num_fields); for i in 0..num_fields { writer.write_int((i * 10) as i32); } - row.point_to(writer.to_bytes(), 0, writer.position()); + let bytes = writer.to_bytes(); + row = CompactedRow::from_bytes(types.as_slice(), bytes.as_ref()); for i in 0..num_fields { assert_eq!(row.get_int(i), (i * 10) as i32); diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs index 00d94ad675..c053d4ec31 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs @@ -15,54 +15,75 @@ // specific language governing permissions and limitations // under the 
License. -use bytes::Bytes; -use std::borrow::Cow; - +use crate::row::compacted::compacted_row::calculate_bit_set_width_in_bytes; use crate::{ metadata::DataType, - row::{ - Datum, GenericRow, - compacted::{compacted_row::CompactedRow, compacted_row_writer::CompactedRowWriter}, - }, + row::{Datum, GenericRow, compacted::compacted_row_writer::CompactedRowWriter}, }; +use std::str::from_utf8; #[allow(dead_code)] -pub struct CompactedRowDeserializer { - schema: Vec, +pub struct CompactedRowDeserializer<'a> { + schema: &'a [DataType], } #[allow(dead_code)] -impl CompactedRowDeserializer { - pub fn new(schema: Vec) -> Self { +impl<'a> CompactedRowDeserializer<'a> { + pub fn new(schema: &'a [DataType]) -> Self { Self { schema } } - pub fn deserialize(&self, reader: &mut CompactedRowReader) -> GenericRow<'static> { + pub fn deserialize(&self, reader: &CompactedRowReader<'a>) -> GenericRow<'a> { let mut row = GenericRow::new(); - for (pos, dtype) in self.schema.iter().enumerate() { - if reader.is_null_at(pos) { - row.set_field(pos, Datum::Null); + let mut cursor = reader.initial_position(); + for (col_pos, dtype) in self.schema.iter().enumerate() { + if dtype.is_nullable() && reader.is_null_at(col_pos) { + row.set_field(col_pos, Datum::Null); continue; } - let datum = match dtype { - DataType::Boolean(_) => Datum::Bool(reader.read_boolean()), - DataType::TinyInt(_) => Datum::Int8(reader.read_byte() as i8), - DataType::SmallInt(_) => Datum::Int16(reader.read_short()), - DataType::Int(_) => Datum::Int32(reader.read_int()), - DataType::BigInt(_) => Datum::Int64(reader.read_long()), - DataType::Float(_) => Datum::Float32(reader.read_float().into()), - DataType::Double(_) => Datum::Float64(reader.read_double().into()), + let (datum, next_cursor) = match dtype { + DataType::Boolean(_) => { + let (val, next) = reader.read_boolean(cursor); + (Datum::Bool(val), next) + } + DataType::TinyInt(_) => { + let (val, next) = reader.read_byte(cursor); + (Datum::Int8(val as i8), next) + } + 
DataType::SmallInt(_) => { + let (val, next) = reader.read_short(cursor); + (Datum::Int16(val), next) + } + DataType::Int(_) => { + let (val, next) = reader.read_int(cursor); + (Datum::Int32(val), next) + } + DataType::BigInt(_) => { + let (val, next) = reader.read_long(cursor); + (Datum::Int64(val), next) + } + DataType::Float(_) => { + let (val, next) = reader.read_float(cursor); + (Datum::Float32(val.into()), next) + } + DataType::Double(_) => { + let (val, next) = reader.read_double(cursor); + (Datum::Float64(val.into()), next) + } // TODO: use read_char(length) in the future, but need to keep compatibility DataType::Char(_) | DataType::String(_) => { - Datum::String(Cow::Owned(reader.read_string())) + let (val, next) = reader.read_string(cursor); + (Datum::String(val.into()), next) } // TODO: use read_binary(length) in the future, but need to keep compatibility DataType::Bytes(_) | DataType::Binary(_) => { - Datum::Blob(Cow::Owned(reader.read_bytes().into_vec())) + let (val, next) = reader.read_bytes(cursor); + (Datum::Blob(val.into()), next) } _ => panic!("unsupported DataType in CompactedRowDeserializer"), }; - row.set_field(pos, datum); + cursor = next_cursor; + row.set_field(col_pos, datum); } row } @@ -71,151 +92,133 @@ impl CompactedRowDeserializer { // Reference implementation: // https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/row/compacted/CompactedRowReader.java #[allow(dead_code)] -pub struct CompactedRowReader { - segment: Bytes, +pub struct CompactedRowReader<'a> { + segment: &'a [u8], offset: usize, - position: usize, limit: usize, header_size_in_bytes: usize, } #[allow(dead_code)] -impl CompactedRowReader { - pub fn new(field_count: usize) -> Self { - let header = CompactedRow::calculate_bit_set_width_in_bytes(field_count); - Self { - header_size_in_bytes: header, - segment: Bytes::new(), - offset: 0, - position: 0, - limit: 0, - } - } - - pub fn point_to(&mut self, data: Bytes, offset: usize, length: usize) 
{ +impl<'a> CompactedRowReader<'a> { + pub fn new(field_count: usize, data: &'a [u8], offset: usize, length: usize) -> Self { + let header_size_in_bytes = calculate_bit_set_width_in_bytes(field_count); let limit = offset + length; - let position = offset + self.header_size_in_bytes; - + let position = offset + header_size_in_bytes; debug_assert!(limit <= data.len()); debug_assert!(position <= limit); - self.segment = data; - self.offset = offset; - self.position = position; - self.limit = limit; + CompactedRowReader { + segment: data, + offset, + limit, + header_size_in_bytes, + } } - pub fn is_null_at(&self, pos: usize) -> bool { - let byte_index = pos >> 3; - let bit = pos & 7; + fn initial_position(&self) -> usize { + self.offset + self.header_size_in_bytes + } + + pub fn is_null_at(&self, col_pos: usize) -> bool { + let byte_index = col_pos >> 3; + let bit = col_pos & 7; debug_assert!(byte_index < self.header_size_in_bytes); let idx = self.offset + byte_index; (self.segment[idx] & (1u8 << bit)) != 0 } - pub fn read_boolean(&mut self) -> bool { - self.read_byte() != 0 + pub fn read_boolean(&self, pos: usize) -> (bool, usize) { + let (val, next) = self.read_byte(pos); + (val != 0, next) } - pub fn read_byte(&mut self) -> u8 { - debug_assert!(self.position < self.limit); - let b = self.segment[self.position]; - self.position += 1; - b + pub fn read_byte(&self, pos: usize) -> (u8, usize) { + debug_assert!(pos < self.limit); + (self.segment[pos], pos + 1) } - pub fn read_short(&mut self) -> i16 { - debug_assert!(self.position + 2 <= self.limit); - let bytes_slice = &self.segment[self.position..self.position + 2]; - let byte_array: [u8; 2] = bytes_slice - .try_into() - .expect("Slice must be exactly 2 bytes long"); - - self.position += 2; - i16::from_ne_bytes(byte_array) + pub fn read_short(&self, pos: usize) -> (i16, usize) { + let next_pos = pos + 2; + debug_assert!(next_pos <= self.limit); + let bytes_slice = &self.segment[pos..pos + 2]; + let val = 
i16::from_ne_bytes( + bytes_slice + .try_into() + .expect("Slice must be exactly 2 bytes long"), + ); + (val, next_pos) } - pub fn read_int(&mut self) -> i32 { + pub fn read_int(&self, mut pos: usize) -> (i32, usize) { let mut result: u32 = 0; let mut shift = 0; for _ in 0..CompactedRowWriter::MAX_INT_SIZE { - let b = self.read_byte(); + let (b, next_pos) = self.read_byte(pos); + pos = next_pos; result |= ((b & 0x7F) as u32) << shift; if (b & 0x80) == 0 { - return result as i32; + return (result as i32, pos); } shift += 7; } - - panic!("Invalid input stream."); + panic!("Invalid VarInt32 input stream."); } - pub fn read_long(&mut self) -> i64 { + pub fn read_long(&self, mut pos: usize) -> (i64, usize) { let mut result: u64 = 0; let mut shift = 0; for _ in 0..CompactedRowWriter::MAX_LONG_SIZE { - let b = self.read_byte(); + let (b, next_pos) = self.read_byte(pos); + pos = next_pos; result |= ((b & 0x7F) as u64) << shift; if (b & 0x80) == 0 { - return result as i64; + return (result as i64, pos); } shift += 7; } - - panic!("Invalid input stream."); + panic!("Invalid VarInt64 input stream."); } - pub fn read_float(&mut self) -> f32 { - debug_assert!(self.position + 4 <= self.limit); - let bytes_slice = &self.segment[self.position..self.position + 4]; - let byte_array: [u8; 4] = bytes_slice - .try_into() - .expect("Slice must be exactly 4 bytes long"); - - self.position += 4; - f32::from_ne_bytes(byte_array) + pub fn read_float(&self, pos: usize) -> (f32, usize) { + let next_pos = pos + 4; + debug_assert!(next_pos <= self.limit); + let val = f32::from_ne_bytes( + self.segment[pos..pos + 4] + .try_into() + .expect("Slice must be exactly 4 bytes long"), + ); + (val, next_pos) } - pub fn read_double(&mut self) -> f64 { - debug_assert!(self.position + 8 <= self.limit); - let bytes_slice = &self.segment[self.position..self.position + 8]; - let byte_array: [u8; 8] = bytes_slice - .try_into() - .expect("Slice must be exactly 8 bytes long"); - - self.position += 8; - 
f64::from_ne_bytes(byte_array) + pub fn read_double(&self, pos: usize) -> (f64, usize) { + let next_pos = pos + 8; + debug_assert!(next_pos <= self.limit); + let val = f64::from_ne_bytes( + self.segment[pos..pos + 8] + .try_into() + .expect("Slice must be exactly 8 bytes long"), + ); + (val, next_pos) } - pub fn read_binary(&mut self, length: usize) -> Bytes { - debug_assert!(self.position + length <= self.limit); - - let start = self.position; - let end = start + length; - self.position = end; - - self.segment.slice(start..end) + pub fn read_binary(&self, pos: usize) -> (&'a [u8], usize) { + self.read_bytes(pos) } - pub fn read_bytes(&mut self) -> Box<[u8]> { - let len = self.read_int(); - debug_assert!(len >= 0); - + pub fn read_bytes(&self, pos: usize) -> (&'a [u8], usize) { + let (len, data_pos) = self.read_int(pos); let len = len as usize; - debug_assert!(self.position + len <= self.limit); - - let start = self.position; - let end = start + len; - self.position = end; - - self.segment[start..end].to_vec().into_boxed_slice() + let next_pos = data_pos + len; + debug_assert!(next_pos <= self.limit); + (&self.segment[data_pos..next_pos], next_pos) } - pub fn read_string(&mut self) -> String { - let bytes = self.read_bytes(); - String::from_utf8(bytes.into_vec()) - .unwrap_or_else(|e| panic!("Invalid UTF-8 in string data: {e}")) + pub fn read_string(&self, pos: usize) -> (&'a str, usize) { + let (bytes, next_pos) = self.read_bytes(pos); + let s = from_utf8(bytes).expect("Invalid UTF-8 when reading string"); + (s, next_pos) } } diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs index 834512350c..4f535c6bf0 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs @@ -18,7 +18,7 @@ use bytes::{Bytes, BytesMut}; use std::cmp; -use crate::row::compacted::compacted_row::CompactedRow; 
+use crate::row::compacted::compacted_row::calculate_bit_set_width_in_bytes; // Writer for CompactedRow // Reference implementation: @@ -36,7 +36,7 @@ impl CompactedRowWriter { pub const MAX_LONG_SIZE: usize = 10; pub fn new(field_count: usize) -> Self { - let header_size = CompactedRow::calculate_bit_set_width_in_bytes(field_count); + let header_size = calculate_bit_set_width_in_bytes(field_count); let cap = cmp::max(64, header_size); let mut buffer = BytesMut::with_capacity(cap); diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs index fa85ded4b8..ad7948dcef 100644 --- a/fluss-rust/crates/fluss/src/row/datum.rs +++ b/fluss-rust/crates/fluss/src/row/datum.rs @@ -403,7 +403,7 @@ mod tests { #[test] fn datum_accessors_and_conversions() { - let datum = Datum::String("value"); + let datum = Datum::String("value".into()); assert_eq!(datum.as_str(), "value"); assert!(!datum.is_null()); From 979c0d4285a59470adf5873592c98547b626f01e Mon Sep 17 00:00:00 2001 From: Anton Borisov <148864665+fresh-borzoni@users.noreply.github.com> Date: Mon, 12 Jan 2026 02:18:27 +0000 Subject: [PATCH 061/287] chore: fix potential deadlock with holding lock during await (#145) --- .../fluss/src/client/write/accumulator.rs | 195 +++++++++++------- 1 file changed, 121 insertions(+), 74 deletions(-) diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs index 001d0aa7d0..74aab9f4fa 100644 --- a/fluss-rust/crates/fluss/src/client/write/accumulator.rs +++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs @@ -31,6 +31,9 @@ use std::sync::Arc; use std::sync::atomic::{AtomicI32, AtomicI64, Ordering}; use tokio::sync::Mutex; +// Type alias to simplify complex nested types +type BucketBatches = Vec<(BucketId, Arc>>)>; + #[allow(dead_code)] pub struct RecordAccumulator { config: Config, @@ -138,20 +141,25 @@ impl RecordAccumulator { abort_if_batch_full: bool, ) -> Result { let 
table_path = &record.table_path; - let mut binding = self - .write_batches - .entry(table_path.as_ref().clone()) - .or_insert_with(|| BucketAndWriteBatches { - table_id: 0, - is_partitioned_table: false, - partition_id: None, - batches: Default::default(), - }); - let bucket_and_batches = binding.value_mut(); - let dq = bucket_and_batches - .batches - .entry(bucket_id) - .or_insert_with(|| Mutex::new(VecDeque::new())); + + let dq = { + let mut binding = self + .write_batches + .entry(table_path.as_ref().clone()) + .or_insert_with(|| BucketAndWriteBatches { + table_id: 0, + is_partitioned_table: false, + partition_id: None, + batches: Default::default(), + }); + let bucket_and_batches = binding.value_mut(); + bucket_and_batches + .batches + .entry(bucket_id) + .or_insert_with(|| Arc::new(Mutex::new(VecDeque::new()))) + .clone() + }; + let mut dq_guard = dq.lock().await; if let Some(append_result) = self.try_append(record, &mut dq_guard)? { return Ok(append_result); @@ -166,16 +174,31 @@ impl RecordAccumulator { } pub async fn ready(&self, cluster: &Arc) -> ReadyCheckResult { + // Snapshot just the Arcs we need, avoiding cloning the entire BucketAndWriteBatches struct + let entries: Vec<(TablePath, BucketBatches)> = self + .write_batches + .iter() + .map(|entry| { + let table_path = entry.key().clone(); + let bucket_batches: Vec<_> = entry + .value() + .batches + .iter() + .map(|(bucket_id, batch_arc)| (*bucket_id, batch_arc.clone())) + .collect(); + (table_path, bucket_batches) + }) + .collect(); + let mut ready_nodes = HashSet::new(); let mut next_ready_check_delay_ms = self.batch_timeout_ms; let mut unknown_leader_tables = HashSet::new(); - for entry in self.write_batches.iter() { - let table_path = entry.key(); - let batches = entry.value(); + + for (table_path, bucket_batches) in entries { next_ready_check_delay_ms = self .bucket_ready( - table_path, - batches, + &table_path, + bucket_batches, &mut ready_nodes, &mut unknown_leader_tables, cluster, @@ -194,7 
+217,7 @@ impl RecordAccumulator { async fn bucket_ready( &self, table_path: &TablePath, - batches: &BucketAndWriteBatches, + bucket_batches: BucketBatches, ready_nodes: &mut HashSet, unknown_leader_tables: &mut HashSet, cluster: &Cluster, @@ -202,7 +225,7 @@ impl RecordAccumulator { ) -> i64 { let mut next_delay = next_ready_check_delay_ms; - for (bucket_id, batch) in batches.batches.iter() { + for (bucket_id, batch) in bucket_batches { let batch_guard = batch.lock().await; if batch_guard.is_empty() { continue; @@ -212,7 +235,7 @@ impl RecordAccumulator { let waited_time_ms = batch.waited_time_ms(current_time_ms()); let deque_size = batch_guard.len(); let full = deque_size > 1 || batch.is_closed(); - let table_bucket = cluster.get_table_bucket(table_path, *bucket_id); + let table_bucket = cluster.get_table_bucket(table_path, bucket_id); if let Some(leader) = cluster.leader_for(&table_bucket) { next_delay = self.batch_ready(leader, waited_time_ms, full, ready_nodes, next_delay); @@ -281,60 +304,77 @@ impl RecordAccumulator { return Ok(ready); } - let mut nodes_drain_index_guard = self.nodes_drain_index.lock().await; - let drain_index = nodes_drain_index_guard.entry(node.id()).or_insert(0); - let start = *drain_index % buckets.len(); + // Get the start index without holding the lock across awaits + let start = { + let mut nodes_drain_index_guard = self.nodes_drain_index.lock().await; + let drain_index = nodes_drain_index_guard.entry(node.id()).or_insert(0); + *drain_index % buckets.len() + }; + let mut current_index = start; + // Assigned at the start of each loop iteration (line 323), used after loop (line 376) + let mut last_processed_index; loop { let bucket = &buckets[current_index]; let table_path = bucket.table_path.clone(); let table_bucket = bucket.table_bucket.clone(); - nodes_drain_index_guard.insert(node.id(), current_index); + last_processed_index = current_index; current_index = (current_index + 1) % buckets.len(); - let bucket_and_write_batches = 
self.write_batches.get(&table_path); - if let Some(bucket_and_write_batches) = bucket_and_write_batches { - if let Some(deque) = bucket_and_write_batches - .batches - .get(&table_bucket.bucket_id()) + let deque = self + .write_batches + .get(&table_path) + .and_then(|bucket_and_write_batches| { + bucket_and_write_batches + .batches + .get(&table_bucket.bucket_id()) + .cloned() + }); + + if let Some(deque) = deque { + let mut maybe_batch = None; { - let mut maybe_batch = None; - { - let mut batch_lock = deque.lock().await; - if !batch_lock.is_empty() { - let first_batch = batch_lock.front().unwrap(); - - if size + first_batch.estimated_size_in_bytes() > max_size as i64 - && !ready.is_empty() - { - // there is a rare case that a single batch size is larger than the request size - // due to compression; in this case we will still eventually send this batch in - // a single request. - break; - } - - maybe_batch = Some(batch_lock.pop_front().unwrap()); + let mut batch_lock = deque.lock().await; + if !batch_lock.is_empty() { + let first_batch = batch_lock.front().unwrap(); + + if size + first_batch.estimated_size_in_bytes() > max_size as i64 + && !ready.is_empty() + { + // there is a rare case that a single batch size is larger than the request size + // due to compression; in this case we will still eventually send this batch in + // a single request. + break; } + + maybe_batch = Some(batch_lock.pop_front().unwrap()); } + } - if let Some(mut batch) = maybe_batch { - let current_batch_size = batch.estimated_size_in_bytes(); - size += current_batch_size; + if let Some(mut batch) = maybe_batch { + let current_batch_size = batch.estimated_size_in_bytes(); + size += current_batch_size; - // mark the batch as drained. - batch.drained(current_time_ms()); - ready.push(ReadyWriteBatch { - table_bucket, - write_batch: batch, - }); - } + // mark the batch as drained. 
+ batch.drained(current_time_ms()); + ready.push(ReadyWriteBatch { + table_bucket, + write_batch: batch, + }); } } if current_index == start { break; } } + + // Store the last processed index to maintain round-robin fairness + { + let mut nodes_drain_index_guard = self.nodes_drain_index.lock().await; + nodes_drain_index_guard.insert(node.id(), last_processed_index); + } + Ok(ready) } @@ -347,20 +387,25 @@ impl RecordAccumulator { let table_path = ready_write_batch.write_batch.table_path().clone(); let bucket_id = ready_write_batch.table_bucket.bucket_id(); let table_id = u64::try_from(ready_write_batch.table_bucket.table_id()).unwrap_or(0); - let mut binding = - self.write_batches - .entry(table_path) - .or_insert_with(|| BucketAndWriteBatches { - table_id, - is_partitioned_table: false, - partition_id: None, - batches: Default::default(), - }); - let bucket_and_batches = binding.value_mut(); - let dq = bucket_and_batches - .batches - .entry(bucket_id) - .or_insert_with(|| Mutex::new(VecDeque::new())); + + let dq = { + let mut binding = + self.write_batches + .entry(table_path) + .or_insert_with(|| BucketAndWriteBatches { + table_id, + is_partitioned_table: false, + partition_id: None, + batches: Default::default(), + }); + let bucket_and_batches = binding.value_mut(); + bucket_and_batches + .batches + .entry(bucket_id) + .or_insert_with(|| Arc::new(Mutex::new(VecDeque::new()))) + .clone() + }; + let mut dq_guard = dq.lock().await; dq_guard.push_front(ready_write_batch.write_batch); } @@ -392,9 +437,11 @@ impl RecordAccumulator { } #[allow(unused_must_use)] - #[allow(clippy::await_holding_lock)] pub async fn await_flush_completion(&self) -> Result<()> { - for result_handle in self.incomplete_batches.read().values() { + // Clone handles before awaiting to avoid holding RwLock read guard across await points + let handles: Vec<_> = self.incomplete_batches.read().values().cloned().collect(); + + for result_handle in handles { result_handle.wait().await?; } Ok(()) @@ 
-411,7 +458,7 @@ struct BucketAndWriteBatches { table_id: TableId, is_partitioned_table: bool, partition_id: Option, - batches: HashMap>>, + batches: HashMap>>>, } pub struct RecordAppendResult { From 63509b155d44ceae8f5195c33e31edbae45622b3 Mon Sep 17 00:00:00 2001 From: Anton Borisov <148864665+fresh-borzoni@users.noreply.github.com> Date: Mon, 12 Jan 2026 07:12:27 +0000 Subject: [PATCH 062/287] chore: decrement flushes_in_progress counter in await_flush_comp (#147) --- .../fluss/src/client/write/accumulator.rs | 47 +++++++++++++++++-- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs index 74aab9f4fa..83f11ab782 100644 --- a/fluss-rust/crates/fluss/src/client/write/accumulator.rs +++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs @@ -441,10 +441,20 @@ impl RecordAccumulator { // Clone handles before awaiting to avoid holding RwLock read guard across await points let handles: Vec<_> = self.incomplete_batches.read().values().cloned().collect(); - for result_handle in handles { - result_handle.wait().await?; + // Await on all handles + let result = async { + for result_handle in handles { + result_handle.wait().await?; + } + Ok(()) } - Ok(()) + .await; + + // Always decrement flushes_in_progress, even if an error occurred + // This mimics the Java finally block behavior + self.flushes_in_progress.fetch_sub(1, Ordering::SeqCst); + + result } } @@ -557,4 +567,35 @@ mod tests { assert_eq!(batch.write_batch.attempts(), 1); Ok(()) } + + #[tokio::test] + async fn flush_counter_decremented_on_error() -> Result<()> { + use crate::client::write::broadcast::BroadcastOnce; + use std::sync::atomic::Ordering; + + let config = Config::default(); + let accumulator = RecordAccumulator::new(config); + + accumulator.begin_flush(); + assert_eq!(accumulator.flushes_in_progress.load(Ordering::SeqCst), 1); + + // Create a failing batch by dropping the 
BroadcastOnce without broadcasting + { + let broadcast = BroadcastOnce::default(); + let receiver = broadcast.receiver(); + let handle = ResultHandle::new(receiver); + accumulator.incomplete_batches.write().insert(1, handle); + // broadcast is dropped here, causing an error + } + + // Await flush completion should fail but still decrement counter + let result = accumulator.await_flush_completion().await; + assert!(result.is_err()); + + // Counter should still be decremented (this is the critical fix!) + assert_eq!(accumulator.flushes_in_progress.load(Ordering::SeqCst), 0); + assert!(!accumulator.flush_in_progress()); + + Ok(()) + } } From 3cdc49526ee2a8159df976f49944ca55251972b9 Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Wed, 14 Jan 2026 01:42:19 +0000 Subject: [PATCH 063/287] chore: add DEVELOPMENT.md (#155) --- fluss-rust/DEVELOPMENT.md | 115 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 fluss-rust/DEVELOPMENT.md diff --git a/fluss-rust/DEVELOPMENT.md b/fluss-rust/DEVELOPMENT.md new file mode 100644 index 0000000000..a8a6d5380c --- /dev/null +++ b/fluss-rust/DEVELOPMENT.md @@ -0,0 +1,115 @@ + + +# Development Guide + +Welcome to the development guide of `fluss-rust`! This project builds `fluss-rust` client and language specific bindings. + +## Pre-requisites + +- protobuf +- rust + +You can install these using your favourite package / version manager. Example installation using mise: + +```bash +mise install protobuf +mise install rust +``` + +## IDE Setup + +We recommend [RustRover](https://www.jetbrains.com/rust/) IDE to work with fluss-rust code base. + +### Importing fluss-rust + +1. On your terminal, clone fluss-rust project from GitHub + ```bash + git clone https://github.com/apache/fluss-rust.git + ``` +1. Open RustRover, on `Projects` tab, click `Open` and navigate to the root directory of fluss-rust +1. 
Click `Open` + +### Copyright Profile + +Fluss and Fluss-rust are Apache projects and as such every file needs to have an Apache license header. This can be automated in RustRover by adding a Copyright profile: + +1. Go to `Settings` -> `Editor` -> `Copyright` -> `Copyright Profiles`. +1. Add a new profile and name it `Apache`. +1. Add the following text as the license text: + ``` + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + ``` +1. Go to `Editor` -> `Copyright` and choose the `Apache` profile as the default profile for this project. +1. Click `Apply` + +We also use line comment formatting for license headers. +1. Go to `Editor` -> `Copyright` -> `Formatting` -> `Rust` +1. Choose `Use custom formatting` +1. Choose `Use line comment` + +## Project directories + +Source files are organized in the following manner: + +1. `crates/fluss` - fluss rust client crate source +1. `crates/examples` - fluss rust client examples +1. `bindings` - bindings to other languages e.g. C++ under `bindings/cpp` and Python under `bindings/python` + + +## Building & Testing + +See [quickstart](README.md#quick-start) for steps to run example code.
+ +Running all unit tests for fluss rust client: + +```bash +cargo test --workspace +``` + +Running all integration test cases: + +```bash +cargo test --features integration_tests --workspace +``` + + +### Formatting and Clippy + +Our CI runs cargo formatting and clippy to help keep the code base styling tidy and readable. Run the following commands and address any errors or warnings to ensure that your PR can complete CI successfully. + +```bash +cargo fmt --all +cargo clippy --all-targets --fix --allow-dirty --allow-staged +``` + From d8fdcc7a3589991b1cae9fc5e70addcfc3ff9e12 Mon Sep 17 00:00:00 2001 From: Anton Borisov <148864665+fresh-borzoni@users.noreply.github.com> Date: Thu, 15 Jan 2026 02:51:57 +0000 Subject: [PATCH 064/287] feat: introduce KvRecordBatchBuilder (#156) --- .../crates/fluss/src/record/kv/kv_record.rs | 343 +++++++++++ .../fluss/src/record/kv/kv_record_batch.rs | 394 ++++++++++++ .../src/record/kv/kv_record_batch_builder.rs | 581 ++++++++++++++++++ fluss-rust/crates/fluss/src/record/kv/mod.rs | 35 ++ fluss-rust/crates/fluss/src/record/mod.rs | 1 + .../src/row/compacted/compacted_key_writer.rs | 6 + .../src/row/compacted/compacted_row_reader.rs | 35 +- .../src/row/compacted/compacted_row_writer.rs | 31 +- fluss-rust/crates/fluss/src/row/mod.rs | 7 +- fluss-rust/crates/fluss/src/util/mod.rs | 1 + fluss-rust/crates/fluss/src/util/varint.rs | 502 +++++++++++++++ 11 files changed, 1893 insertions(+), 43 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/record/kv/kv_record.rs create mode 100644 fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs create mode 100644 fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs create mode 100644 fluss-rust/crates/fluss/src/record/kv/mod.rs create mode 100644 fluss-rust/crates/fluss/src/util/varint.rs diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record.rs new file mode 100644 index 0000000000..8c30713d42 --- /dev/null +++ 
b/fluss-rust/crates/fluss/src/record/kv/kv_record.rs @@ -0,0 +1,343 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Key-Value record implementation. +//! +//! This module provides the KvRecord struct which represents an immutable key-value record. +//! The record format is: +//! - Length => Int32 +//! - KeyLength => Unsigned VarInt +//! - Key => bytes +//! - Row => BinaryRow (optional, if null then this is a deletion record) + +use bytes::{BufMut, Bytes, BytesMut}; +use std::io; + +use crate::util::varint::{ + read_unsigned_varint_bytes, size_of_unsigned_varint, write_unsigned_varint_buf, +}; + +/// Length field size in bytes +pub const LENGTH_LENGTH: usize = 4; + +/// A key-value record. +/// +/// The schema is: +/// - Length => Int32 +/// - KeyLength => Unsigned VarInt +/// - Key => bytes +/// - Value => bytes (BinaryRow, written directly without length prefix) +/// +/// When the value is None (deletion), no Value bytes are present. 
+// Reference implementation: +// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/record/KvRecord.java +#[derive(Debug, Clone)] +pub struct KvRecord { + key: Bytes, + value: Option, + size_in_bytes: usize, +} + +impl KvRecord { + /// Create a new KvRecord with the given key and optional value. + pub fn new(key: Bytes, value: Option) -> Self { + let size_in_bytes = Self::size_of(&key, value.as_deref()); + Self { + key, + value, + size_in_bytes, + } + } + + /// Get the key bytes. + pub fn key(&self) -> &Bytes { + &self.key + } + + /// Get the value bytes (None indicates a deletion). + pub fn value(&self) -> Option<&Bytes> { + self.value.as_ref() + } + + /// Calculate the total size of the record when serialized (including length prefix). + pub fn size_of(key: &[u8], value: Option<&[u8]>) -> usize { + Self::size_without_length(key, value) + LENGTH_LENGTH + } + + /// Calculate the size without the length prefix. + fn size_without_length(key: &[u8], value: Option<&[u8]>) -> usize { + let key_len = key.len(); + let key_len_size = size_of_unsigned_varint(key_len as u32); + + match value { + Some(v) => key_len_size.saturating_add(key_len).saturating_add(v.len()), + None => { + // Deletion: no value bytes + key_len_size.saturating_add(key_len) + } + } + } + + /// Write a KV record to a buffer. + /// + /// Returns the number of bytes written. 
+ pub fn write_to_buf(buf: &mut BytesMut, key: &[u8], value: Option<&[u8]>) -> io::Result { + let size_in_bytes = Self::size_without_length(key, value); + + let size_i32 = i32::try_from(size_in_bytes).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("Record size {} exceeds i32::MAX", size_in_bytes), + ) + })?; + buf.put_i32_le(size_i32); + let key_len = key.len() as u32; + write_unsigned_varint_buf(key_len, buf); + + buf.put_slice(key); + + if let Some(v) = value { + buf.put_slice(v); + } + // For None (deletion), don't write any value bytes + + Ok(size_in_bytes + LENGTH_LENGTH) + } + + /// Read a KV record from bytes at the given position. + /// + /// Returns the KvRecord and the number of bytes consumed. + /// + /// TODO: Connect KvReadContext and return CompactedRow records. + pub fn read_from(bytes: &Bytes, position: usize) -> io::Result<(Self, usize)> { + if bytes.len() < position.saturating_add(LENGTH_LENGTH) { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read record length", + )); + } + + let size_in_bytes_i32 = i32::from_le_bytes([ + bytes[position], + bytes[position + 1], + bytes[position + 2], + bytes[position + 3], + ]); + + if size_in_bytes_i32 < 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("Invalid record length: {}", size_in_bytes_i32), + )); + } + + let size_in_bytes = size_in_bytes_i32 as usize; + + let total_size = size_in_bytes.checked_add(LENGTH_LENGTH).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + format!( + "Record size overflow: {} + {}", + size_in_bytes, LENGTH_LENGTH + ), + ) + })?; + + let available = bytes.len().saturating_sub(position); + if available < total_size { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!( + "Not enough bytes to read record: expected {}, available {}", + total_size, available + ), + )); + } + + let mut current_offset = position + LENGTH_LENGTH; + let record_end = position + 
total_size; + + // Read key length as unsigned varint (bounded by record end) + let (key_len, varint_size) = + read_unsigned_varint_bytes(&bytes[current_offset..record_end])?; + current_offset += varint_size; + + // Read key bytes + let key_end = current_offset + key_len as usize; + if key_end > position + total_size { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Key length exceeds record size", + )); + } + let key = bytes.slice(current_offset..key_end); + current_offset = key_end; + + // Read value bytes directly + let value = if current_offset < record_end { + // Value is present: all remaining bytes are the value + let value_bytes = bytes.slice(current_offset..record_end); + Some(value_bytes) + } else { + // No remaining bytes: this is a deletion record + None + }; + + Ok(( + Self { + key, + value, + size_in_bytes: total_size, + }, + total_size, + )) + } + + /// Get the total size in bytes of this record. + pub fn get_size_in_bytes(&self) -> usize { + self.size_in_bytes + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_kv_record_size_calculation() { + let key = b"test_key"; + let value = b"test_value"; + + // With value (no value length varint) + let size_with_value = KvRecord::size_of(key, Some(value)); + assert_eq!( + size_with_value, + LENGTH_LENGTH + size_of_unsigned_varint(key.len() as u32) + key.len() + value.len() + ); + + // Without value + let size_without_value = KvRecord::size_of(key, None); + assert_eq!( + size_without_value, + LENGTH_LENGTH + size_of_unsigned_varint(key.len() as u32) + key.len() + ); + } + + #[test] + fn test_kv_record_write_read_round_trip() { + let key = b"my_key"; + let value = b"my_value_data"; + + let mut buf = BytesMut::new(); + let written = KvRecord::write_to_buf(&mut buf, key, Some(value)).unwrap(); + + let bytes = buf.freeze(); + let (record, read_size) = KvRecord::read_from(&bytes, 0).unwrap(); + + assert_eq!(written, read_size); + assert_eq!(record.key().as_ref(), key); + 
assert_eq!(record.value().unwrap().as_ref(), value); + assert_eq!(record.get_size_in_bytes(), written); + } + + #[test] + fn test_kv_record_deletion() { + let key = b"delete_me"; + + // Write deletion record (no value) + let mut buf = BytesMut::new(); + let written = KvRecord::write_to_buf(&mut buf, key, None).unwrap(); + + let bytes = buf.freeze(); + let (record, read_size) = KvRecord::read_from(&bytes, 0).unwrap(); + + assert_eq!(written, read_size); + assert_eq!(record.key().as_ref(), key); + assert!(record.value().is_none()); + } + + #[test] + fn test_kv_record_with_large_key() { + let key = vec![0u8; 1024]; + let value = vec![1u8; 4096]; + + let mut buf = BytesMut::new(); + let written = KvRecord::write_to_buf(&mut buf, &key, Some(&value)).unwrap(); + + let bytes = buf.freeze(); + let (record, read_size) = KvRecord::read_from(&bytes, 0).unwrap(); + + assert_eq!(written, read_size); + assert_eq!(record.key().len(), key.len()); + assert_eq!(record.value().unwrap().len(), value.len()); + } + + #[test] + fn test_invalid_record_lengths() { + let mut buf = BytesMut::new(); + buf.put_i32_le(-1); // Negative length + buf.put_u8(1); // Some dummy data + buf.put_slice(b"key"); + let bytes = buf.freeze(); + let result = KvRecord::read_from(&bytes, 0); + assert!(result.is_err()); + assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidData); + + // Test overflow length + let mut buf = BytesMut::new(); + buf.put_i32_le(i32::MAX); // Very large length + buf.put_u8(1); // Some dummy data + let bytes = buf.freeze(); + let result = KvRecord::read_from(&bytes, 0); + assert!(result.is_err()); + + // Test impossibly large but non-negative length + let mut buf = BytesMut::new(); + buf.put_i32_le(1_000_000); + let bytes = buf.freeze(); + let result = KvRecord::read_from(&bytes, 0); + assert!(result.is_err()); + assert_eq!(result.unwrap_err().kind(), io::ErrorKind::UnexpectedEof); + } + + #[test] + fn test_multiple_records_in_buffer() { + let records = vec![ + 
(b"key1".as_slice(), Some(b"value1".as_slice())), + (b"key2".as_slice(), None), + (b"key3".as_slice(), Some(b"value3".as_slice())), + ]; + + let mut buf = BytesMut::new(); + for (key, value) in &records { + KvRecord::write_to_buf(&mut buf, key, *value).unwrap(); + } + + let bytes = buf.freeze(); + let mut offset = 0; + for (expected_key, expected_value) in &records { + let (record, size) = KvRecord::read_from(&bytes, offset).unwrap(); + assert_eq!(record.key().as_ref(), *expected_key); + match expected_value { + Some(v) => assert_eq!(record.value().unwrap().as_ref(), *v), + None => assert!(record.value().is_none()), + } + offset += size; + } + assert_eq!(offset, bytes.len()); + } +} diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs new file mode 100644 index 0000000000..fdd4ad7322 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs @@ -0,0 +1,394 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! KV record batch implementation. +//! +//! The schema of a KvRecordBatch is: +//! - Length => Int32 +//! - Magic => Int8 +//! - CRC => Uint32 +//! - SchemaId => Int16 +//! - Attributes => Int8 +//! 
- WriterId => Int64 +//! - BatchSequence => Int32 +//! - RecordCount => Int32 +//! - Records => [Record] +//! +//! The CRC covers data from the SchemaId to the end of the batch. + +use bytes::Bytes; +use std::io; + +use crate::record::kv::KvRecord; + +// Field lengths in bytes +pub const LENGTH_LENGTH: usize = 4; +pub const MAGIC_LENGTH: usize = 1; +pub const CRC_LENGTH: usize = 4; +pub const SCHEMA_ID_LENGTH: usize = 2; +pub const ATTRIBUTE_LENGTH: usize = 1; +pub const WRITE_CLIENT_ID_LENGTH: usize = 8; +pub const BATCH_SEQUENCE_LENGTH: usize = 4; +pub const RECORDS_COUNT_LENGTH: usize = 4; + +// Field offsets +pub const LENGTH_OFFSET: usize = 0; +pub const MAGIC_OFFSET: usize = LENGTH_OFFSET + LENGTH_LENGTH; +pub const CRC_OFFSET: usize = MAGIC_OFFSET + MAGIC_LENGTH; +pub const SCHEMA_ID_OFFSET: usize = CRC_OFFSET + CRC_LENGTH; +pub const ATTRIBUTES_OFFSET: usize = SCHEMA_ID_OFFSET + SCHEMA_ID_LENGTH; +pub const WRITE_CLIENT_ID_OFFSET: usize = ATTRIBUTES_OFFSET + ATTRIBUTE_LENGTH; +pub const BATCH_SEQUENCE_OFFSET: usize = WRITE_CLIENT_ID_OFFSET + WRITE_CLIENT_ID_LENGTH; +pub const RECORDS_COUNT_OFFSET: usize = BATCH_SEQUENCE_OFFSET + BATCH_SEQUENCE_LENGTH; +pub const RECORDS_OFFSET: usize = RECORDS_COUNT_OFFSET + RECORDS_COUNT_LENGTH; + +/// Total header size +pub const RECORD_BATCH_HEADER_SIZE: usize = RECORDS_OFFSET; + +/// Overhead of the batch (length field) +pub const KV_OVERHEAD: usize = LENGTH_OFFSET + LENGTH_LENGTH; + +/// A KV record batch. +/// +/// This struct provides read access to a serialized KV record batch. +// Reference implementation: +// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/record/KvRecordBatch.java +pub struct KvRecordBatch { + data: Bytes, + position: usize, +} + +impl KvRecordBatch { + /// Create a new KvRecordBatch pointing to the given data at the specified position. 
+ pub fn new(data: Bytes, position: usize) -> Self { + Self { data, position } + } + + /// Get the size in bytes of this batch. + pub fn size_in_bytes(&self) -> io::Result { + if self.data.len() < self.position.saturating_add(LENGTH_LENGTH) { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read batch length", + )); + } + let length_i32 = i32::from_le_bytes([ + self.data[self.position], + self.data[self.position + 1], + self.data[self.position + 2], + self.data[self.position + 3], + ]); + + if length_i32 < 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("Invalid batch length: {}", length_i32), + )); + } + + let length = length_i32 as usize; + + Ok(length.saturating_add(KV_OVERHEAD)) + } + + /// Check if this batch is valid by verifying the checksum. + pub fn is_valid(&self) -> bool { + if !matches!(self.size_in_bytes(), Ok(s) if s >= RECORD_BATCH_HEADER_SIZE) { + return false; + } + + match (self.checksum(), self.compute_checksum()) { + (Ok(stored), Ok(computed)) => stored == computed, + _ => false, + } + } + + /// Get the magic byte. + pub fn magic(&self) -> io::Result { + if self.data.len() < self.position.saturating_add(MAGIC_OFFSET).saturating_add(1) { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read magic byte", + )); + } + Ok(self.data[self.position + MAGIC_OFFSET]) + } + + /// Get the checksum. + pub fn checksum(&self) -> io::Result { + if self.data.len() < self.position.saturating_add(CRC_OFFSET).saturating_add(4) { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read checksum", + )); + } + Ok(u32::from_le_bytes([ + self.data[self.position + CRC_OFFSET], + self.data[self.position + CRC_OFFSET + 1], + self.data[self.position + CRC_OFFSET + 2], + self.data[self.position + CRC_OFFSET + 3], + ])) + } + + /// Compute the checksum of this batch. 
+ pub fn compute_checksum(&self) -> io::Result { + let size = self.size_in_bytes()?; + if size < RECORD_BATCH_HEADER_SIZE { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "Batch size {} is less than header size {}", + size, RECORD_BATCH_HEADER_SIZE + ), + )); + } + + let start = self.position.saturating_add(SCHEMA_ID_OFFSET); + let end = self.position.saturating_add(size); + + if end > self.data.len() || start >= end { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to compute checksum", + )); + } + + Ok(crc32c::crc32c(&self.data[start..end])) + } + + /// Get the schema ID. + pub fn schema_id(&self) -> io::Result { + if self.data.len() + < self + .position + .saturating_add(SCHEMA_ID_OFFSET) + .saturating_add(2) + { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read schema ID", + )); + } + Ok(i16::from_le_bytes([ + self.data[self.position + SCHEMA_ID_OFFSET], + self.data[self.position + SCHEMA_ID_OFFSET + 1], + ])) + } + + /// Get the writer ID. + pub fn writer_id(&self) -> io::Result { + if self.data.len() + < self + .position + .saturating_add(WRITE_CLIENT_ID_OFFSET) + .saturating_add(8) + { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read writer ID", + )); + } + Ok(i64::from_le_bytes([ + self.data[self.position + WRITE_CLIENT_ID_OFFSET], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 1], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 2], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 3], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 4], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 5], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 6], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 7], + ])) + } + + /// Get the batch sequence. 
+ pub fn batch_sequence(&self) -> io::Result { + if self.data.len() + < self + .position + .saturating_add(BATCH_SEQUENCE_OFFSET) + .saturating_add(4) + { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read batch sequence", + )); + } + Ok(i32::from_le_bytes([ + self.data[self.position + BATCH_SEQUENCE_OFFSET], + self.data[self.position + BATCH_SEQUENCE_OFFSET + 1], + self.data[self.position + BATCH_SEQUENCE_OFFSET + 2], + self.data[self.position + BATCH_SEQUENCE_OFFSET + 3], + ])) + } + + /// Get the number of records in this batch. + pub fn record_count(&self) -> io::Result { + if self.data.len() + < self + .position + .saturating_add(RECORDS_COUNT_OFFSET) + .saturating_add(4) + { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read record count", + )); + } + Ok(i32::from_le_bytes([ + self.data[self.position + RECORDS_COUNT_OFFSET], + self.data[self.position + RECORDS_COUNT_OFFSET + 1], + self.data[self.position + RECORDS_COUNT_OFFSET + 2], + self.data[self.position + RECORDS_COUNT_OFFSET + 3], + ])) + } + + /// Create an iterator over the records in this batch. + /// This validates the batch checksum before returning the iterator. + /// For trusted data paths, use `records_unchecked()` to skip validation. 
+ pub fn records(&self) -> io::Result { + if !self.is_valid() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid batch checksum", + )); + } + self.records_unchecked() + } + + /// Create an iterator over the records in this batch without validating the checksum + pub fn records_unchecked(&self) -> io::Result { + let size = self.size_in_bytes()?; + let count = self.record_count()?; + if count < 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("Invalid record count: {}", count), + )); + } + Ok(KvRecordIterator { + data: self.data.clone(), + position: self.position + RECORDS_OFFSET, + end: self.position + size, + remaining_count: count, + }) + } +} + +/// Iterator over records in a KV record batch. +pub struct KvRecordIterator { + data: Bytes, + position: usize, + end: usize, + remaining_count: i32, +} + +impl Iterator for KvRecordIterator { + type Item = io::Result; + + fn next(&mut self) -> Option { + if self.remaining_count <= 0 || self.position >= self.end { + return None; + } + + match KvRecord::read_from(&self.data, self.position) { + Ok((record, size)) => { + self.position += size; + self.remaining_count -= 1; + Some(Ok(record)) + } + Err(e) => { + self.remaining_count = 0; // Stop iteration on error + Some(Err(e)) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::KvFormat; + use crate::record::kv::{CURRENT_KV_MAGIC_VALUE, KvRecordBatchBuilder}; + use bytes::{BufMut, BytesMut}; + + #[test] + fn test_invalid_batch_lengths() { + // Test negative length + let mut buf = BytesMut::new(); + buf.put_i32_le(-1); + let bytes = buf.freeze(); + let batch = KvRecordBatch::new(bytes, 0); + assert!(batch.size_in_bytes().is_err()); // Should error for invalid + assert!(!batch.is_valid()); + + // Test overflow length + let mut buf = BytesMut::new(); + buf.put_i32_le(i32::MAX); + let bytes = buf.freeze(); + let batch = KvRecordBatch::new(bytes, 0); + assert!(!batch.is_valid()); + + // Test too-short 
buffer + let mut buf = BytesMut::new(); + buf.put_i32_le(100); // Claims 100 bytes but buffer is tiny + let bytes = buf.freeze(); + let batch = KvRecordBatch::new(bytes, 0); + assert!(!batch.is_valid()); + } + + #[test] + fn test_kv_record_batch_build_and_read() { + use crate::row::compacted::CompactedRowWriter; + + let schema_id = 42; + let write_limit = 4096; + + let mut builder = KvRecordBatchBuilder::new(schema_id, write_limit, KvFormat::COMPACTED); + builder.set_writer_state(100, 5); + + let key1 = b"key1"; + let mut value1_writer = CompactedRowWriter::new(1); + value1_writer.write_bytes(&[1, 2, 3, 4, 5]); + builder.append_row(key1, Some(&value1_writer)).unwrap(); + + let key2 = b"key2"; + builder + .append_row::(key2, None) + .unwrap(); + + let bytes = builder.build().unwrap(); + + let batch = KvRecordBatch::new(bytes.clone(), 0); + assert!(batch.is_valid()); + assert_eq!(batch.magic().unwrap(), CURRENT_KV_MAGIC_VALUE); + assert_eq!(batch.schema_id().unwrap(), schema_id as i16); + assert_eq!(batch.writer_id().unwrap(), 100); + assert_eq!(batch.batch_sequence().unwrap(), 5); + assert_eq!(batch.record_count().unwrap(), 2); + + let records: Vec<_> = batch.records().unwrap().collect(); + assert_eq!(records.len(), 2); + + let record1 = records[0].as_ref().unwrap(); + assert_eq!(record1.key().as_ref(), key1); + assert_eq!(record1.value().unwrap().as_ref(), value1_writer.buffer()); + + let record2 = records[1].as_ref().unwrap(); + assert_eq!(record2.key().as_ref(), key2); + assert!(record2.value().is_none()); + } +} diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs new file mode 100644 index 0000000000..773c7789b5 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs @@ -0,0 +1,581 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! KV record batch builder implementation. +//! +//! This module provides the KvRecordBatchBuilder for building batches of KV records. + +use bytes::{Bytes, BytesMut}; +use std::io; + +use crate::metadata::KvFormat; +use crate::record::kv::kv_record::KvRecord; +use crate::record::kv::kv_record_batch::{ + ATTRIBUTES_OFFSET, BATCH_SEQUENCE_OFFSET, CRC_OFFSET, LENGTH_LENGTH, LENGTH_OFFSET, + MAGIC_OFFSET, RECORD_BATCH_HEADER_SIZE, RECORDS_COUNT_OFFSET, SCHEMA_ID_OFFSET, + WRITE_CLIENT_ID_OFFSET, +}; +use crate::record::kv::{CURRENT_KV_MAGIC_VALUE, NO_BATCH_SEQUENCE, NO_WRITER_ID}; +use crate::row::BinaryRow; + +/// Builder for KvRecordBatch. +/// +/// This builder accumulates KV records and produces a serialized batch with proper +/// header information and checksums. +// Reference implementation: +// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/record/KvRecordBatchBuilder.java +pub struct KvRecordBatchBuilder { + schema_id: i32, + magic: u8, + write_limit: usize, + buffer: BytesMut, + writer_id: i64, + batch_sequence: i32, + current_record_number: i32, + size_in_bytes: usize, + is_closed: bool, + kv_format: KvFormat, + aborted: bool, + built_buffer: Option, +} + +impl KvRecordBatchBuilder { + /// Create a new KvRecordBatchBuilder. 
+ /// + /// # Arguments + /// * `schema_id` - The schema ID for records in this batch (must fit in i16) + /// * `write_limit` - Maximum bytes that can be appended + /// * `kv_format` - The KV format (Compacted, Indexed, or Aligned) + pub fn new(schema_id: i32, write_limit: usize, kv_format: KvFormat) -> Self { + assert!( + schema_id <= i16::MAX as i32, + "schema_id shouldn't be greater than the max value of i16: {}", + i16::MAX + ); + + let mut buffer = BytesMut::with_capacity(write_limit.max(RECORD_BATCH_HEADER_SIZE)); + + // Reserve space for header (we'll write it at the end) + buffer.resize(RECORD_BATCH_HEADER_SIZE, 0); + + Self { + schema_id, + magic: CURRENT_KV_MAGIC_VALUE, + write_limit, + buffer, + writer_id: NO_WRITER_ID, + batch_sequence: NO_BATCH_SEQUENCE, + current_record_number: 0, + size_in_bytes: RECORD_BATCH_HEADER_SIZE, + is_closed: false, + kv_format, + aborted: false, + built_buffer: None, + } + } + + /// Check if there is room for a new record containing the given key and row. + /// Returns `true` only if the current batch size (header included) plus the record size stays within `write_limit`; an empty batch gets no exemption. + pub fn has_room_for_row(&self, key: &[u8], row: Option<&R>) -> bool { + let value = row.map(|r| r.as_bytes()); + self.size_in_bytes + KvRecord::size_of(key, value) <= self.write_limit + } + + /// Append a KV record with a row value to the batch. 
+ /// + /// Returns an error if: + /// - The builder has been aborted + /// - The builder is closed + /// - Adding this record would exceed the write limit + /// - The maximum number of records is exceeded + /// - The KV format is not COMPACTED + pub fn append_row(&mut self, key: &[u8], row: Option<&R>) -> io::Result<()> { + if self.kv_format != KvFormat::COMPACTED { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "append_row can only be used with KvFormat::COMPACTED", + )); + } + + if self.aborted { + return Err(io::Error::other( + "Tried to append a record, but KvRecordBatchBuilder has already been aborted", + )); + } + + if self.is_closed { + return Err(io::Error::other( + "Tried to append a record, but KvRecordBatchBuilder is closed for record appends", + )); + } + + // Check record count limit before mutation + if self.current_record_number == i32::MAX { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!( + "Maximum number of records per batch exceeded, max records: {}", + i32::MAX + ), + )); + } + + let value = row.map(|r| r.as_bytes()); + let record_size = KvRecord::size_of(key, value); + if self.size_in_bytes + record_size > self.write_limit { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + format!( + "Adding record would exceed write limit: {} + {} > {}", + self.size_in_bytes, record_size, self.write_limit + ), + )); + } + + let record_byte_size = KvRecord::write_to_buf(&mut self.buffer, key, value)?; + debug_assert_eq!(record_byte_size, record_size, "Record size mismatch"); + + self.current_record_number += 1; + self.size_in_bytes += record_byte_size; + + // Invalidate cached buffer since we modified the batch + self.built_buffer = None; + + Ok(()) + } + + /// Set the writer state (writer ID and batch base sequence). + /// + /// This invalidates any cached buffer, ensuring the batch header will be rebuilt + /// on the next call to [`build`](Self::build). 
+ pub fn set_writer_state(&mut self, writer_id: i64, batch_base_sequence: i32) { + self.writer_id = writer_id; + self.batch_sequence = batch_base_sequence; + // Invalidate cached buffer since header fields changed + self.built_buffer = None; + } + + /// Build the batch and return the serialized bytes. + /// + /// This can be called multiple times as the batch is cached after the first build. + /// + /// # Caching and Mutations + /// + /// The builder caches the result after the first successful build. However, the cache + /// is invalidated (and the batch rebuilt) if any of the following occur after building: + /// - Calling [`append_row`](Self::append_row) to add records + /// - Calling [`set_writer_state`](Self::set_writer_state) to modify writer metadata + /// + /// This allows the builder to be reused with different writer states or to continue + /// appending records after an initial build, but callers should be aware that the + /// built bytes may change if mutations occur between builds. + /// + /// Note: [`close`](Self::close) prevents further appends but does not prevent writer state modifications. + pub fn build(&mut self) -> io::Result { + if self.aborted { + return Err(io::Error::other( + "Attempting to build an aborted record batch", + )); + } + + if let Some(ref cached) = self.built_buffer { + return Ok(cached.clone()); + } + + self.write_batch_header()?; + let bytes = self.buffer.clone().freeze(); + self.built_buffer = Some(bytes); + Ok(self.built_buffer.as_ref().unwrap().clone()) + } + + /// Get the writer ID. + pub fn writer_id(&self) -> i64 { + self.writer_id + } + + /// Get the batch sequence. + pub fn batch_sequence(&self) -> i32 { + self.batch_sequence + } + + /// Check if the builder is closed. + pub fn is_closed(&self) -> bool { + self.is_closed + } + + /// Abort the builder. + /// After aborting, no more records can be appended and the batch cannot be built. + pub fn abort(&mut self) { + self.aborted = true; + } + + /// Close the builder. 
+ /// After closing, no more records can be appended, but the batch can still be built. + pub fn close(&mut self) -> io::Result<()> { + if self.aborted { + return Err(io::Error::other( + "Cannot close KvRecordBatchBuilder as it has already been aborted", + )); + } + self.is_closed = true; + Ok(()) + } + + /// Get the current size in bytes of the batch. + pub fn get_size_in_bytes(&self) -> usize { + self.size_in_bytes + } + + // ----------------------- Internal methods ------------------------------- + + /// Write the batch header. + fn write_batch_header(&mut self) -> io::Result<()> { + let size_without_length = self.size_in_bytes - LENGTH_LENGTH; + let total_size = i32::try_from(size_without_length).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("Batch size {} exceeds i32::MAX", size_without_length), + ) + })?; + + // Compute attributes before borrowing buffer mutably + let attributes = self.compute_attributes(); + + // Write to the beginning of the buffer + let header = &mut self.buffer[0..RECORD_BATCH_HEADER_SIZE]; + + // Write length + header[LENGTH_OFFSET..LENGTH_OFFSET + LENGTH_LENGTH] + .copy_from_slice(&total_size.to_le_bytes()); + + // Write magic + header[MAGIC_OFFSET] = self.magic; + + // Write empty CRC first (will update later) + header[CRC_OFFSET..CRC_OFFSET + 4].copy_from_slice(&0u32.to_le_bytes()); + + // Write schema ID + header[SCHEMA_ID_OFFSET..SCHEMA_ID_OFFSET + 2] + .copy_from_slice(&(self.schema_id as i16).to_le_bytes()); + + // Write attributes + header[ATTRIBUTES_OFFSET] = attributes; + + // Write writer ID + header[WRITE_CLIENT_ID_OFFSET..WRITE_CLIENT_ID_OFFSET + 8] + .copy_from_slice(&self.writer_id.to_le_bytes()); + + // Write batch sequence + header[BATCH_SEQUENCE_OFFSET..BATCH_SEQUENCE_OFFSET + 4] + .copy_from_slice(&self.batch_sequence.to_le_bytes()); + + // Write record count + header[RECORDS_COUNT_OFFSET..RECORDS_COUNT_OFFSET + 4] + .copy_from_slice(&self.current_record_number.to_le_bytes()); + + // Compute 
and update CRC + let crc = crc32c::crc32c(&self.buffer[SCHEMA_ID_OFFSET..self.size_in_bytes]); + self.buffer[CRC_OFFSET..CRC_OFFSET + 4].copy_from_slice(&crc.to_le_bytes()); + + Ok(()) + } + + /// Compute the attributes byte. + fn compute_attributes(&self) -> u8 { + // Currently no attributes are used + 0 + } +} + +impl Drop for KvRecordBatchBuilder { + fn drop(&mut self) { + // Warn if the builder has records but was never built or was aborted + if self.current_record_number > 0 && !self.aborted && self.built_buffer.is_none() { + eprintln!( + "Warning: KvRecordBatchBuilder dropped with {} record(s) that were never built. \ + Call build() to serialize the batch before dropping.", + self.current_record_number + ); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::row::compacted::CompactedRowWriter; + + // Helper function to create a CompactedRowWriter with a single bytes field for testing + fn create_test_row(data: &[u8]) -> CompactedRowWriter { + let mut writer = CompactedRowWriter::new(1); + writer.write_bytes(data); + writer + } + + #[test] + fn test_builder_basic_workflow() { + let schema_id = 42; + let write_limit = 4096; + let mut builder = KvRecordBatchBuilder::new(schema_id, write_limit, KvFormat::COMPACTED); + + // Test initial state + assert!(!builder.is_closed()); + assert_eq!(builder.writer_id(), NO_WRITER_ID); + assert_eq!(builder.batch_sequence(), NO_BATCH_SEQUENCE); + + // Test writer state + builder.set_writer_state(100, 5); + assert_eq!(builder.writer_id(), 100); + assert_eq!(builder.batch_sequence(), 5); + + // Test appending records + let key1 = b"key1"; + let value1 = create_test_row(b"value1"); + assert!(builder.has_room_for_row(key1, Some(&value1))); + builder.append_row(key1, Some(&value1)).unwrap(); + + let key2 = b"key2"; + assert!(builder.has_room_for_row::(key2, None)); + builder + .append_row::(key2, None) + .unwrap(); + + // Test close and build + builder.close().unwrap(); + assert!(builder.is_closed()); + + let bytes 
= builder.build().unwrap(); + assert!(bytes.len() > RECORD_BATCH_HEADER_SIZE); + + // Building again should return cached result + let bytes2 = builder.build().unwrap(); + assert_eq!(bytes.len(), bytes2.len()); + } + + #[test] + fn test_builder_lifecycle() { + // Test abort behavior + let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); + let value = create_test_row(b"value"); + builder.append_row(b"key", Some(&value)).unwrap(); + builder.abort(); + assert!( + builder + .append_row::(b"key2", None) + .is_err() + ); + assert!(builder.build().is_err()); + assert!(builder.close().is_err()); + + // Test close behavior + let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); + let value = create_test_row(b"value"); + builder.append_row(b"key", Some(&value)).unwrap(); + builder.close().unwrap(); + assert!( + builder + .append_row::(b"key2", None) + .is_err() + ); // Can't append after close + assert!(builder.build().is_ok()); // But can still build + } + + #[test] + fn test_write_limit_enforcement() { + let write_limit = 100; // Very small limit + let mut builder = KvRecordBatchBuilder::new(1, write_limit, KvFormat::COMPACTED); + + // Test has_room_for_row helper + let large_key = vec![0u8; 1000]; + let large_value = vec![1u8; 1000]; + let large_row = create_test_row(&large_value); + assert!(!builder.has_room_for_row(&large_key, Some(&large_row))); + let small_value = create_test_row(b"value"); + assert!(builder.has_room_for_row(b"key", Some(&small_value))); + + // Test append enforcement - add small record first + builder.append_row(b"key", Some(&small_value)).unwrap(); + + // Try to add large record that exceeds limit (reuse large_row from above) + let result = builder.append_row(b"key2", Some(&large_row)); + assert!(result.is_err()); + assert_eq!(result.unwrap_err().kind(), io::ErrorKind::WriteZero); + } + + #[test] + fn test_append_checks_record_count_limit() { + let mut builder = KvRecordBatchBuilder::new(1, 100000, 
KvFormat::COMPACTED); + builder.current_record_number = i32::MAX - 1; + + let value1 = create_test_row(b"value1"); + builder.append_row(b"key1", Some(&value1)).unwrap(); + + let value2 = create_test_row(b"value2"); + let result = builder.append_row(b"key2", Some(&value2)); + assert!(result.is_err()); + assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidInput); + } + + #[test] + #[should_panic(expected = "schema_id shouldn't be greater than")] + fn test_builder_invalid_schema_id() { + KvRecordBatchBuilder::new(i16::MAX as i32 + 1, 4096, KvFormat::COMPACTED); + } + + #[test] + fn test_cache_invalidation_on_append() { + let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); + builder.set_writer_state(100, 5); + + let value1 = create_test_row(b"value1"); + builder.append_row(b"key1", Some(&value1)).unwrap(); + let bytes1 = builder.build().unwrap(); + let len1 = bytes1.len(); + + // Append another record - this should invalidate the cache + let value2 = create_test_row(b"value2"); + builder.append_row(b"key2", Some(&value2)).unwrap(); + let bytes2 = builder.build().unwrap(); + let len2 = bytes2.len(); + + // Verify the second build includes both records + assert!(len2 > len1, "Second build should be larger"); + + use crate::record::kv::KvRecordBatch; + let batch = KvRecordBatch::new(bytes2, 0); + assert!(batch.is_valid()); + assert_eq!(batch.record_count().unwrap(), 2, "Should have 2 records"); + } + + #[test] + fn test_cache_invalidation_on_set_writer_state() { + let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); + + builder.set_writer_state(100, 5); + let value = create_test_row(b"value"); + builder.append_row(b"key", Some(&value)).unwrap(); + let bytes1 = builder.build().unwrap(); + + // Change writer state - this should invalidate the cache + builder.set_writer_state(200, 10); + let bytes2 = builder.build().unwrap(); + + assert_ne!( + bytes1, bytes2, + "Bytes should differ after writer state change" + ); + + use 
crate::record::kv::KvRecordBatch; + let batch1 = KvRecordBatch::new(bytes1, 0); + let batch2 = KvRecordBatch::new(bytes2, 0); + + assert_eq!(batch1.writer_id().unwrap(), 100); + assert_eq!(batch1.batch_sequence().unwrap(), 5); + + assert_eq!(batch2.writer_id().unwrap(), 200); + assert_eq!(batch2.batch_sequence().unwrap(), 10); + } + + #[test] + fn test_builder_with_compacted_row_writer() { + use crate::metadata::{DataType, IntType, StringType}; + use crate::record::kv::KvRecordBatch; + use crate::row::InternalRow; + use crate::row::compacted::CompactedRow; + + let mut builder = KvRecordBatchBuilder::new(1, 100000, KvFormat::COMPACTED); + builder.set_writer_state(100, 5); + + let types = vec![ + DataType::Int(IntType::new()), + DataType::String(StringType::new()), + ]; + + // Create and append first record with CompactedRowWriter + let mut row_writer1 = CompactedRowWriter::new(2); + row_writer1.write_int(42); + row_writer1.write_string("hello"); + + let key1 = b"key1"; + assert!(builder.has_room_for_row(key1, Some(&row_writer1))); + builder.append_row(key1, Some(&row_writer1)).unwrap(); + + // Create and append second record + let mut row_writer2 = CompactedRowWriter::new(2); + row_writer2.write_int(100); + row_writer2.write_string("world"); + + let key2 = b"key2"; + builder.append_row(key2, Some(&row_writer2)).unwrap(); + + // Append a deletion record + let key3 = b"key3"; + builder + .append_row::(key3, None) + .unwrap(); + + // Build and verify + builder.close().unwrap(); + let bytes = builder.build().unwrap(); + + let batch = KvRecordBatch::new(bytes, 0); + assert!(batch.is_valid()); + assert_eq!(batch.record_count().unwrap(), 3); + assert_eq!(batch.writer_id().unwrap(), 100); + assert_eq!(batch.batch_sequence().unwrap(), 5); + + // Read back and verify records + let records: Vec<_> = batch.records().unwrap().collect(); + assert_eq!(records.len(), 3); + + // Verify first record + let record1 = records[0].as_ref().unwrap(); + assert_eq!(record1.key().as_ref(), 
key1); + let row1 = CompactedRow::from_bytes(&types, record1.value().unwrap()); + assert_eq!(row1.get_int(0), 42); + assert_eq!(row1.get_string(1), "hello"); + + // Verify second record + let record2 = records[1].as_ref().unwrap(); + assert_eq!(record2.key().as_ref(), key2); + let row2 = CompactedRow::from_bytes(&types, record2.value().unwrap()); + assert_eq!(row2.get_int(0), 100); + assert_eq!(row2.get_string(1), "world"); + + // Verify deletion record + let record3 = records[2].as_ref().unwrap(); + assert_eq!(record3.key().as_ref(), key3); + assert!(record3.value().is_none()); + } + + #[test] + fn test_kv_format_validation() { + let mut row_writer = CompactedRowWriter::new(1); + row_writer.write_int(42); + + // INDEXED format should reject append_row + let mut indexed_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::INDEXED); + let result = indexed_builder.append_row(b"key", Some(&row_writer)); + assert!(result.is_err()); + assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidInput); + + // COMPACTED format should accept append_row + let mut compacted_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); + let result = compacted_builder.append_row(b"key", Some(&row_writer)); + assert!(result.is_ok()); + } +} diff --git a/fluss-rust/crates/fluss/src/record/kv/mod.rs b/fluss-rust/crates/fluss/src/record/kv/mod.rs new file mode 100644 index 0000000000..ecb762df16 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/mod.rs @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Key-Value record and batch implementations. + +mod kv_record; +mod kv_record_batch; +mod kv_record_batch_builder; + +pub use kv_record::{KvRecord, LENGTH_LENGTH as KV_RECORD_LENGTH_LENGTH}; +pub use kv_record_batch::*; +pub use kv_record_batch_builder::*; + +/// Current KV magic value +pub const CURRENT_KV_MAGIC_VALUE: u8 = 0; + +/// No writer ID constant +pub const NO_WRITER_ID: i64 = -1; + +/// No batch sequence constant +pub const NO_BATCH_SEQUENCE: i32 = -1; diff --git a/fluss-rust/crates/fluss/src/record/mod.rs b/fluss-rust/crates/fluss/src/record/mod.rs index 35928ea082..c5a3f8e4b6 100644 --- a/fluss-rust/crates/fluss/src/record/mod.rs +++ b/fluss-rust/crates/fluss/src/record/mod.rs @@ -22,6 +22,7 @@ use std::collections::HashMap; mod arrow; mod error; +pub mod kv; pub use arrow::*; diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs index 84a6b22724..1152b0c5d2 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs @@ -30,6 +30,12 @@ pub struct CompactedKeyWriter { delegate: CompactedRowWriter, } +impl Default for CompactedKeyWriter { + fn default() -> Self { + Self::new() + } +} + impl CompactedKeyWriter { pub fn new() -> CompactedKeyWriter { CompactedKeyWriter { diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs index c053d4ec31..5ec260897e 100644 --- 
a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs @@ -19,6 +19,7 @@ use crate::row::compacted::compacted_row::calculate_bit_set_width_in_bytes; use crate::{ metadata::DataType, row::{Datum, GenericRow, compacted::compacted_row_writer::CompactedRowWriter}, + util::varint::{read_unsigned_varint_at, read_unsigned_varint_u64_at}, }; use std::str::from_utf8; @@ -150,36 +151,18 @@ impl<'a> CompactedRowReader<'a> { (val, next_pos) } - pub fn read_int(&self, mut pos: usize) -> (i32, usize) { - let mut result: u32 = 0; - let mut shift = 0; - - for _ in 0..CompactedRowWriter::MAX_INT_SIZE { - let (b, next_pos) = self.read_byte(pos); - pos = next_pos; - result |= ((b & 0x7F) as u32) << shift; - if (b & 0x80) == 0 { - return (result as i32, pos); - } - shift += 7; + pub fn read_int(&self, pos: usize) -> (i32, usize) { + match read_unsigned_varint_at(self.segment, pos, CompactedRowWriter::MAX_INT_SIZE) { + Ok((value, next_pos)) => (value as i32, next_pos), + Err(_) => panic!("Invalid VarInt32 input stream."), } - panic!("Invalid VarInt32 input stream."); } - pub fn read_long(&self, mut pos: usize) -> (i64, usize) { - let mut result: u64 = 0; - let mut shift = 0; - - for _ in 0..CompactedRowWriter::MAX_LONG_SIZE { - let (b, next_pos) = self.read_byte(pos); - pos = next_pos; - result |= ((b & 0x7F) as u64) << shift; - if (b & 0x80) == 0 { - return (result as i64, pos); - } - shift += 7; + pub fn read_long(&self, pos: usize) -> (i64, usize) { + match read_unsigned_varint_u64_at(self.segment, pos, CompactedRowWriter::MAX_LONG_SIZE) { + Ok((value, next_pos)) => (value as i64, next_pos), + Err(_) => panic!("Invalid VarInt64 input stream."), } - panic!("Invalid VarInt64 input stream."); } pub fn read_float(&self, pos: usize) -> (f32, usize) { diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs index 
4f535c6bf0..63b32a3dca 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs @@ -18,7 +18,9 @@ use bytes::{Bytes, BytesMut}; use std::cmp; +use crate::row::BinaryRow; use crate::row::compacted::compacted_row::calculate_bit_set_width_in_bytes; +use crate::util::varint::{write_unsigned_varint_to_slice, write_unsigned_varint_u64_to_slice}; // Writer for CompactedRow // Reference implementation: @@ -125,25 +127,16 @@ impl CompactedRowWriter { pub fn write_int(&mut self, value: i32) { self.ensure_capacity(Self::MAX_INT_SIZE); - let mut v = value as u32; - while (v & !0x7F) != 0 { - self.buffer[self.position] = ((v as u8) & 0x7F) | 0x80; - self.position += 1; - v >>= 7; - } - self.buffer[self.position] = v as u8; - self.position += 1; + let bytes_written = + write_unsigned_varint_to_slice(value as u32, &mut self.buffer[self.position..]); + self.position += bytes_written; } + pub fn write_long(&mut self, value: i64) { self.ensure_capacity(Self::MAX_LONG_SIZE); - let mut v = value as u64; - while (v & !0x7F) != 0 { - self.buffer[self.position] = ((v as u8) & 0x7F) | 0x80; - self.position += 1; - v >>= 7; - } - self.buffer[self.position] = v as u8; - self.position += 1; + let bytes_written = + write_unsigned_varint_u64_to_slice(value as u64, &mut self.buffer[self.position..]); + self.position += bytes_written; } pub fn write_float(&mut self, value: f32) { @@ -154,3 +147,9 @@ impl CompactedRowWriter { self.write_raw(&value.to_ne_bytes()); } } + +impl BinaryRow for CompactedRowWriter { + fn as_bytes(&self) -> &[u8] { + self.buffer() + } +} diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index c321ab9d6b..144d64fd88 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -20,13 +20,18 @@ mod column; mod datum; mod binary; -mod compacted; +pub mod compacted; mod encode; mod field_getter; pub use 
column::*; pub use datum::*; +pub trait BinaryRow { + /// Returns the binary representation of this row as a byte slice. + fn as_bytes(&self) -> &[u8]; +} + // TODO make functions return Result for better error handling pub trait InternalRow { /// Returns the number of fields in this row diff --git a/fluss-rust/crates/fluss/src/util/mod.rs b/fluss-rust/crates/fluss/src/util/mod.rs index 5f67290e43..d191615e86 100644 --- a/fluss-rust/crates/fluss/src/util/mod.rs +++ b/fluss-rust/crates/fluss/src/util/mod.rs @@ -16,6 +16,7 @@ // under the License. pub mod murmur_hash; +pub mod varint; use crate::metadata::TableBucket; use linked_hash_map::LinkedHashMap; diff --git a/fluss-rust/crates/fluss/src/util/varint.rs b/fluss-rust/crates/fluss/src/util/varint.rs new file mode 100644 index 0000000000..96fd1f50bf --- /dev/null +++ b/fluss-rust/crates/fluss/src/util/varint.rs @@ -0,0 +1,502 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Variable-length integer encoding utilities. +//! +//! This module provides utilities for encoding integers in variable-length format, +//! which can save space when encoding small integers. The encoding uses 7 bits per byte +//! with the most significant bit as a continuation flag. 
+ +use bytes::BufMut; +use std::io::{self, Read, Write}; + +/// Write an unsigned integer in variable-length format. +/// +/// The encoding uses 7 bits per byte with the MSB set to 1 if more bytes follow. +/// This matches the encoding used in Google Protocol Buffers. +#[allow(dead_code)] +pub fn write_unsigned_varint(value: u32, writer: &mut W) -> io::Result { + let mut v = value; + let mut bytes_written = 0; + + while (v & !0x7F) != 0 { + writer.write_all(&[((v as u8) & 0x7F) | 0x80])?; + bytes_written += 1; + v >>= 7; + } + writer.write_all(&[v as u8])?; + bytes_written += 1; + + Ok(bytes_written) +} + +/// Write an unsigned integer in variable-length format to a buffer. +pub fn write_unsigned_varint_buf(value: u32, buf: &mut impl BufMut) { + let mut v = value; + + while (v & !0x7F) != 0 { + buf.put_u8(((v as u8) & 0x7F) | 0x80); + v >>= 7; + } + buf.put_u8(v as u8); +} + +/// Read an unsigned integer stored in variable-length format. +#[allow(dead_code)] +pub fn read_unsigned_varint(reader: &mut R) -> io::Result { + let mut tmp = [0u8; 1]; + reader.read_exact(&mut tmp)?; + let mut byte = tmp[0] as i8; + + if byte >= 0 { + return Ok(byte as u32); + } + + let mut result = (byte & 127) as u32; + + reader.read_exact(&mut tmp)?; + byte = tmp[0] as i8; + if byte >= 0 { + result |= (byte as u32) << 7; + } else { + result |= ((byte & 127) as u32) << 7; + + reader.read_exact(&mut tmp)?; + byte = tmp[0] as i8; + if byte >= 0 { + result |= (byte as u32) << 14; + } else { + result |= ((byte & 127) as u32) << 14; + + reader.read_exact(&mut tmp)?; + byte = tmp[0] as i8; + if byte >= 0 { + result |= (byte as u32) << 21; + } else { + result |= ((byte & 127) as u32) << 21; + + reader.read_exact(&mut tmp)?; + byte = tmp[0] as i8; + result |= (byte as u32) << 28; + + if byte < 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid u32 varint encoding: too many bytes (most significant bit in the 5th byte is set)", + )); + } + } + } + } + + Ok(result) +} + +/// 
Read an unsigned integer from a byte slice in variable-length format. +pub fn read_unsigned_varint_bytes(bytes: &[u8]) -> io::Result<(u32, usize)> { + if bytes.is_empty() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Cannot read varint from empty buffer", + )); + } + + let mut byte = bytes[0] as i8; + let mut index = 1; + + if byte >= 0 { + return Ok((byte as u32, index)); + } + + let mut result = (byte & 127) as u32; + + if index >= bytes.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Incomplete varint", + )); + } + byte = bytes[index] as i8; + index += 1; + if byte >= 0 { + result |= (byte as u32) << 7; + } else { + result |= ((byte & 127) as u32) << 7; + + if index >= bytes.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Incomplete varint", + )); + } + byte = bytes[index] as i8; + index += 1; + if byte >= 0 { + result |= (byte as u32) << 14; + } else { + result |= ((byte & 127) as u32) << 14; + + if index >= bytes.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Incomplete varint", + )); + } + byte = bytes[index] as i8; + index += 1; + if byte >= 0 { + result |= (byte as u32) << 21; + } else { + result |= ((byte & 127) as u32) << 21; + + if index >= bytes.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Incomplete varint", + )); + } + byte = bytes[index] as i8; + index += 1; + result |= (byte as u32) << 28; + + if byte < 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid u32 varint encoding: too many bytes (most significant bit in the 5th byte is set)", + )); + } + } + } + } + + Ok((result, index)) +} + +/// Calculate the number of bytes needed to encode a u32 in variable-length format. +/// +/// Varint encoding uses 7 bits per byte, so we need `ceil(bits_used / 7)` bytes. 
+/// This function computes that efficiently using the formula: +/// +/// size = ((38 - leading_zeros) * 74899) >> 19 + (leading_zeros >> 5) +/// +/// Where: +/// - `38 = 32 + 6` (6 accounts for ceiling in division) +/// - `74899 = 2^19 / 7` (enables division by 7 via multiply + shift) +/// - `leading_zeros >> 5` adds 1 when value is 0 (minimum 1 byte) +pub fn size_of_unsigned_varint(value: u32) -> usize { + let leading_zeros = value.leading_zeros(); + let leading_zeros_below_38_divided_by_7 = ((38 - leading_zeros) * 0b10010010010010011) >> 19; + (leading_zeros_below_38_divided_by_7 + (leading_zeros >> 5)) as usize +} + +/// Calculate the number of bytes needed to encode a u64 in variable-length format. +/// +/// Varint encoding uses 7 bits per byte, so we need `ceil(bits_used / 7)` bytes. +/// This function computes that efficiently using the formula: +/// +/// size = ((70 - leading_zeros) * 74899) >> 19 + (leading_zeros >> 6) +/// +/// - `70 = 64 + 6` (6 accounts for ceiling in division) +/// - `74899 = 2^19 / 7` (enables division by 7 via multiply + shift) +/// - `leading_zeros >> 6` adds 1 when value is 0 (minimum 1 byte) +#[allow(dead_code)] +pub fn size_of_unsigned_varint_u64(value: u64) -> usize { + let leading_zeros = value.leading_zeros(); + let leading_zeros_below_70_divided_by_7 = ((70 - leading_zeros) * 0b10010010010010011) >> 19; + (leading_zeros_below_70_divided_by_7 + (leading_zeros >> 6)) as usize +} + +/// Write an unsigned 64-bit integer in variable-length format to a buffer. +#[allow(dead_code)] +pub fn write_unsigned_varint_u64_buf(value: u64, buf: &mut impl BufMut) { + let mut v = value; + while (v & !0x7F) != 0 { + buf.put_u8(((v as u8) & 0x7F) | 0x80); + v >>= 7; + } + buf.put_u8(v as u8); +} + +/// Write directly to a mutable byte slice, returning the number of bytes written. +/// Used by CompactedRowWriter which manages its own position. +/// +/// # Panics +/// Panics if the slice is too small to hold the encoded varint. 
+/// The slice must have at least 5 bytes available (the maximum size for a u32 varint). +/// Use [`size_of_unsigned_varint`] to calculate the required size beforehand. +pub fn write_unsigned_varint_to_slice(value: u32, slice: &mut [u8]) -> usize { + let mut v = value; + let mut written = 0; + + while (v & !0x7F) != 0 { + slice[written] = ((v as u8) & 0x7F) | 0x80; + written += 1; + v >>= 7; + } + slice[written] = v as u8; + written + 1 +} + +/// Write unsigned 64-bit varint directly to a mutable byte slice. +/// +/// # Panics +/// Panics if the slice is too small to hold the encoded varint. +/// The slice must have at least 10 bytes available (the maximum size for a u64 varint). +pub fn write_unsigned_varint_u64_to_slice(value: u64, slice: &mut [u8]) -> usize { + let mut v = value; + let mut written = 0; + + while (v & !0x7F) != 0 { + slice[written] = ((v as u8) & 0x7F) | 0x80; + written += 1; + v >>= 7; + } + slice[written] = v as u8; + written + 1 +} + +/// Read unsigned varint from a slice starting at given position. +/// Returns (value, next_position). +/// Used by CompactedRowReader which manages positions. +pub fn read_unsigned_varint_at( + slice: &[u8], + mut pos: usize, + max_bytes: usize, +) -> io::Result<(u32, usize)> { + let mut result: u32 = 0; + let mut shift = 0; + + for _ in 0..max_bytes { + if pos >= slice.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Unexpected end of varint", + )); + } + let b = slice[pos]; + pos += 1; + result |= ((b & 0x7F) as u32) << shift; + if (b & 0x80) == 0 { + return Ok((result, pos)); + } + shift += 7; + } + + Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid VarInt32 input stream", + )) +} + +/// Read unsigned 64-bit varint from a slice starting at given position. 
+pub fn read_unsigned_varint_u64_at( + slice: &[u8], + mut pos: usize, + max_bytes: usize, +) -> io::Result<(u64, usize)> { + let mut result: u64 = 0; + let mut shift = 0; + + for _ in 0..max_bytes { + if pos >= slice.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Unexpected end of varint", + )); + } + let b = slice[pos]; + pos += 1; + result |= ((b & 0x7F) as u64) << shift; + if (b & 0x80) == 0 { + return Ok((result, pos)); + } + shift += 7; + } + + Err(io::Error::new( + io::ErrorKind::InvalidData, + "Invalid VarInt64 input stream", + )) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Cursor; + + #[test] + fn test_unsigned_varint_round_trip() { + let test_values = vec![ + 0u32, + 1, + 127, + 128, + 255, + 256, + 16383, + 16384, + 2097151, + 2097152, + 268435455, + 268435456, + u32::MAX, + ]; + + for value in test_values { + // Test with Write trait + let mut buffer = Vec::new(); + let written = write_unsigned_varint(value, &mut buffer).unwrap(); + + let mut reader = Cursor::new(&buffer); + let read_value = read_unsigned_varint(&mut reader).unwrap(); + + assert_eq!(value, read_value, "Round trip failed for value {}", value); + assert_eq!( + written, + buffer.len(), + "Bytes written mismatch for value {}", + value + ); + + // Test with BufMut + let mut buf = bytes::BytesMut::new(); + write_unsigned_varint_buf(value, &mut buf); + assert_eq!(buf.len(), written, "BufMut write length mismatch"); + + // Test size calculation + let calculated_size = size_of_unsigned_varint(value); + assert_eq!( + calculated_size, + buffer.len(), + "Size calculation failed for value {}", + value + ); + + // Test reading from bytes + let (read_value_bytes, bytes_read) = read_unsigned_varint_bytes(&buffer).unwrap(); + assert_eq!( + value, read_value_bytes, + "Bytes read failed for value {}", + value + ); + assert_eq!( + bytes_read, + buffer.len(), + "Bytes read count mismatch for value {}", + value + ); + } + } + + #[test] + fn 
test_size_of_unsigned_varint() { + assert_eq!(size_of_unsigned_varint(0), 1); + assert_eq!(size_of_unsigned_varint(127), 1); + assert_eq!(size_of_unsigned_varint(128), 2); + assert_eq!(size_of_unsigned_varint(16383), 2); + assert_eq!(size_of_unsigned_varint(16384), 3); + assert_eq!(size_of_unsigned_varint(2097151), 3); + assert_eq!(size_of_unsigned_varint(2097152), 4); + assert_eq!(size_of_unsigned_varint(268435455), 4); + assert_eq!(size_of_unsigned_varint(268435456), 5); + assert_eq!(size_of_unsigned_varint(u32::MAX), 5); + } + + #[test] + fn test_size_of_unsigned_varint_u64() { + assert_eq!(size_of_unsigned_varint_u64(0), 1); + assert_eq!(size_of_unsigned_varint_u64(127), 1); + assert_eq!(size_of_unsigned_varint_u64(128), 2); + assert_eq!(size_of_unsigned_varint_u64(16383), 2); + assert_eq!(size_of_unsigned_varint_u64(16384), 3); + assert_eq!(size_of_unsigned_varint_u64(2097151), 3); + assert_eq!(size_of_unsigned_varint_u64(2097152), 4); + assert_eq!(size_of_unsigned_varint_u64(268435455), 4); + assert_eq!(size_of_unsigned_varint_u64(268435456), 5); + assert_eq!(size_of_unsigned_varint_u64(u32::MAX as u64), 5); + assert_eq!(size_of_unsigned_varint_u64(34359738367), 5); + assert_eq!(size_of_unsigned_varint_u64(34359738368), 6); + assert_eq!(size_of_unsigned_varint_u64(4398046511103), 6); + assert_eq!(size_of_unsigned_varint_u64(4398046511104), 7); + assert_eq!(size_of_unsigned_varint_u64(562949953421311), 7); + assert_eq!(size_of_unsigned_varint_u64(562949953421312), 8); + assert_eq!(size_of_unsigned_varint_u64(72057594037927935), 8); + assert_eq!(size_of_unsigned_varint_u64(72057594037927936), 9); + assert_eq!(size_of_unsigned_varint_u64(9223372036854775807), 9); + assert_eq!(size_of_unsigned_varint_u64(9223372036854775808), 10); + assert_eq!(size_of_unsigned_varint_u64(u64::MAX), 10); + } + + #[test] + fn test_read_unsigned_varint_bytes_error_handling() { + // Empty buffer + assert!(read_unsigned_varint_bytes(&[]).is_err()); + + // Incomplete varint 
(continuation bit set but no next byte) + assert!(read_unsigned_varint_bytes(&[0x80]).is_err()); + assert!(read_unsigned_varint_bytes(&[0xFF, 0x80]).is_err()); + } + + #[test] + fn test_write_read_to_slice() { + // Test u32 varint to slice + let test_values_u32 = vec![0u32, 127, 128, 16384, u32::MAX]; + + for value in test_values_u32 { + let mut buffer = vec![0u8; 10]; + let written = write_unsigned_varint_to_slice(value, &mut buffer); + + let (read_value, next_pos) = read_unsigned_varint_at(&buffer, 0, 5).unwrap(); + assert_eq!(value, read_value); + assert_eq!(written, next_pos); + } + + // Test u64 varint to slice + let test_values_u64 = vec![0u64, 127, 128, 16384, u32::MAX as u64, u64::MAX]; + + for value in test_values_u64 { + let mut buffer = vec![0u8; 10]; + let written = write_unsigned_varint_u64_to_slice(value, &mut buffer); + + let (read_value, next_pos) = read_unsigned_varint_u64_at(&buffer, 0, 10).unwrap(); + assert_eq!(value, read_value); + assert_eq!(written, next_pos); + } + } + + #[test] + fn test_read_at_with_offset() { + // Write multiple varints and read at different positions + let mut buffer = vec![0u8; 20]; + let mut pos = 0; + + pos += write_unsigned_varint_to_slice(127, &mut buffer[pos..]); + pos += write_unsigned_varint_to_slice(16384, &mut buffer[pos..]); + let end_pos = pos + write_unsigned_varint_to_slice(u32::MAX, &mut buffer[pos..]); + + // Read back + let (val1, pos1) = read_unsigned_varint_at(&buffer, 0, 5).unwrap(); + assert_eq!(val1, 127); + + let (val2, pos2) = read_unsigned_varint_at(&buffer, pos1, 5).unwrap(); + assert_eq!(val2, 16384); + + let (val3, pos3) = read_unsigned_varint_at(&buffer, pos2, 5).unwrap(); + assert_eq!(val3, u32::MAX); + assert_eq!(pos3, end_pos); + } +} From 207a7d183c34d8230bdf049e02dc54754723e97b Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Thu, 15 Jan 2026 09:25:31 +0000 Subject: [PATCH 065/287] chore: Introduce WriteFormat and various small changes to unblock Upsert implementation (#158) --- 
fluss-rust/crates/fluss/Cargo.toml | 2 + .../crates/fluss/src/client/write/mod.rs | 2 + .../fluss/src/client/write/write_format.rs | 65 +++++++++++++++++++ fluss-rust/crates/fluss/src/error.rs | 9 +++ .../fluss/src/metadata/data_lake_format.rs | 3 + .../crates/fluss/src/metadata/datatype.rs | 23 +++++++ fluss-rust/crates/fluss/src/metadata/table.rs | 22 ++++++- .../crates/fluss/src/row/field_getter.rs | 14 +++- 8 files changed, 138 insertions(+), 2 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/client/write/write_format.rs diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index e8c851f7b7..8942ffc7db 100644 --- a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -60,6 +60,8 @@ tempfile = "3.23.0" snafu = "0.8.3" scopeguard = "1.2.0" delegate = "0.13.5" +strum = "0.26" +strum_macros = "0.26" [target.'cfg(target_arch = "wasm32")'.dependencies] jiff = { workspace = true, features = ["js"] } diff --git a/fluss-rust/crates/fluss/src/client/write/mod.rs b/fluss-rust/crates/fluss/src/client/write/mod.rs index d79418bfc2..00a71c51fa 100644 --- a/fluss-rust/crates/fluss/src/client/write/mod.rs +++ b/fluss-rust/crates/fluss/src/client/write/mod.rs @@ -30,8 +30,10 @@ pub(crate) mod broadcast; mod bucket_assigner; mod sender; +mod write_format; mod writer_client; +pub use write_format::WriteFormat; pub use writer_client::WriterClient; pub struct WriteRecord<'a> { diff --git a/fluss-rust/crates/fluss/src/client/write/write_format.rs b/fluss-rust/crates/fluss/src/client/write/write_format.rs new file mode 100644 index 0000000000..d65e42de68 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/write_format.rs @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::KvFormat; +use std::fmt::Display; + +pub enum WriteFormat { + ArrowLog, + CompactedLog, + CompactedKv, +} + +impl WriteFormat { + pub const fn is_log(&self) -> bool { + matches!(self, Self::ArrowLog | Self::CompactedLog) + } + + pub fn is_kv(&self) -> bool { + !self.is_log() + } + + pub fn to_kv_format(&self) -> Result { + match self { + WriteFormat::CompactedKv => Ok(KvFormat::COMPACTED), + other => Err(IllegalArgument { + message: format!("WriteFormat `{}` is not a KvFormat", other), + }), + } + } + + pub fn from_kv_format(kv_format: &KvFormat) -> Result { + match kv_format { + KvFormat::COMPACTED => Ok(WriteFormat::CompactedKv), + other => Err(IllegalArgument { + message: format!("Unknown KvFormat: `{}`", other), + }), + } + } +} + +impl Display for WriteFormat { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WriteFormat::ArrowLog => f.write_str("ArrowLog"), + WriteFormat::CompactedLog => f.write_str("CompactedLog"), + WriteFormat::CompactedKv => f.write_str("CompactedKv"), + } + } +} diff --git a/fluss-rust/crates/fluss/src/error.rs b/fluss-rust/crates/fluss/src/error.rs index e04fde14d7..0a368b7be9 100644 --- a/fluss-rust/crates/fluss/src/error.rs +++ b/fluss-rust/crates/fluss/src/error.rs @@ -21,6 +21,7 @@ pub use crate::rpc::{ApiError, 
FlussError}; use arrow_schema::ArrowError; use snafu::Snafu; use std::{io, result}; +use strum::ParseError; pub type Result = result::Result; @@ -155,3 +156,11 @@ impl From for Error { Error::FlussAPIError { api_error: value } } } + +impl From for Error { + fn from(value: ParseError) -> Self { + Error::IllegalArgument { + message: value.to_string(), + } + } +} diff --git a/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs b/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs index 76a23f8d96..c1861098c3 100644 --- a/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs +++ b/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs @@ -15,11 +15,14 @@ // specific language governing permissions and limitations // under the License. +use strum_macros::{Display, EnumString}; + /// Identifies the logical format of a data lake table supported by Fluss. /// /// This enum is typically used in metadata and configuration to distinguish /// between different table formats so that the appropriate integration and /// semantics can be applied. +#[derive(Debug, EnumString, Display, PartialEq)] pub enum DataLakeFormat { /// Apache Paimon data lake table format. Paimon, diff --git a/fluss-rust/crates/fluss/src/metadata/datatype.rs b/fluss-rust/crates/fluss/src/metadata/datatype.rs index c53cd273cb..dc1f40730f 100644 --- a/fluss-rust/crates/fluss/src/metadata/datatype.rs +++ b/fluss-rust/crates/fluss/src/metadata/datatype.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
+use crate::error::Error::IllegalArgument; +use crate::error::Result; use serde::{Deserialize, Serialize}; use std::fmt::{Display, Formatter}; @@ -857,6 +859,27 @@ impl RowType { self.fields.iter().position(|f| f.name == field_name) } + pub fn get_field_names(&self) -> Vec<&str> { + self.fields.iter().map(|f| f.name.as_str()).collect() + } + + pub fn project(&self, project_field_positions: &[usize]) -> Result { + Ok(RowType::with_nullable( + self.nullable, + project_field_positions + .iter() + .map(|pos| { + self.fields + .get(*pos) + .cloned() + .ok_or_else(|| IllegalArgument { + message: format!("invalid field position: {}", *pos), + }) + }) + .collect::>>()?, + )) + } + #[cfg(test)] pub fn with_data_types(data_types: Vec) -> Self { let mut fields: Vec = Vec::new(); diff --git a/fluss-rust/crates/fluss/src/metadata/table.rs b/fluss-rust/crates/fluss/src/metadata/table.rs index 4f6c04bc61..b1e8a90ba2 100644 --- a/fluss-rust/crates/fluss/src/metadata/table.rs +++ b/fluss-rust/crates/fluss/src/metadata/table.rs @@ -18,11 +18,13 @@ use crate::compression::ArrowCompressionInfo; use crate::error::Error::InvalidTableError; use crate::error::{Error, Result}; +use crate::metadata::DataLakeFormat; use crate::metadata::datatype::{DataField, DataType, RowType}; use core::fmt; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::fmt::{Display, Formatter}; +use strum_macros::EnumString; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct Column { @@ -603,7 +605,7 @@ impl LogFormat { } } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, EnumString)] pub enum KvFormat { INDEXED, COMPACTED, @@ -726,6 +728,24 @@ impl TableConfig { pub fn get_arrow_compression_info(&self) -> Result { ArrowCompressionInfo::from_conf(&self.properties) } + + pub fn get_datalake_format(&self) -> Result> { + self.properties + .get("table.datalake.format") + .map(|f| 
f.parse().map_err(Error::from)) + .transpose() + } + + pub fn get_kv_format(&self) -> Result { + // TODO: Consolidate configurations logic, constants, defaults in a single place + const DEFAULT_KV_FORMAT: &str = "COMPACTED"; + let kv_format = self + .properties + .get("table.kv.format") + .map(String::as_str) + .unwrap_or(DEFAULT_KV_FORMAT); + kv_format.parse().map_err(Into::into) + } } impl TableInfo { diff --git a/fluss-rust/crates/fluss/src/row/field_getter.rs b/fluss-rust/crates/fluss/src/row/field_getter.rs index 8e529e5446..97f9e395fc 100644 --- a/fluss-rust/crates/fluss/src/row/field_getter.rs +++ b/fluss-rust/crates/fluss/src/row/field_getter.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. -use crate::metadata::DataType; +use crate::metadata::{DataType, RowType}; use crate::row::{Datum, InternalRow}; +#[derive(Clone)] pub enum FieldGetter { Nullable(InnerFieldGetter), NonNullable(InnerFieldGetter), @@ -36,6 +37,16 @@ impl FieldGetter { } } + #[allow(dead_code)] + pub fn create_field_getters(row_type: &RowType) -> Box<[FieldGetter]> { + row_type + .fields() + .iter() + .enumerate() + .map(|(pos, field)| Self::create(field.data_type(), pos)) + .collect() + } + pub fn create(data_type: &DataType, pos: usize) -> FieldGetter { let inner_field_getter = match data_type { DataType::Char(t) => InnerFieldGetter::Char { @@ -66,6 +77,7 @@ impl FieldGetter { } } +#[derive(Clone)] pub enum InnerFieldGetter { Char { pos: usize, len: usize }, String { pos: usize }, From b9cc78bf37f04c6d2ff3bd0151dd9750ad90a9db Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Thu, 15 Jan 2026 15:45:41 +0000 Subject: [PATCH 066/287] feat: Introduce CompactedRowEncoder (#161) --- .../crates/fluss/src/metadata/datatype.rs | 10 ++- .../fluss/src/record/kv/kv_record_batch.rs | 13 +-- .../src/record/kv/kv_record_batch_builder.rs | 51 ++++++------ .../fluss/src/row/compacted/compacted_row.rs | 41 ++++++--- 
.../src/row/compacted/compacted_row_reader.rs | 18 +++- .../src/row/compacted/compacted_row_writer.rs | 59 +++++++------ .../src/row/encode/compacted_row_encoder.rs | 83 +++++++++++++++++++ fluss-rust/crates/fluss/src/row/encode/mod.rs | 68 ++++++++++++++- fluss-rust/crates/fluss/src/row/mod.rs | 4 +- 9 files changed, 263 insertions(+), 84 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs diff --git a/fluss-rust/crates/fluss/src/metadata/datatype.rs b/fluss-rust/crates/fluss/src/metadata/datatype.rs index dc1f40730f..f1574665eb 100644 --- a/fluss-rust/crates/fluss/src/metadata/datatype.rs +++ b/fluss-rust/crates/fluss/src/metadata/datatype.rs @@ -682,11 +682,11 @@ impl Default for BytesType { } impl BytesType { - pub fn new() -> Self { + pub const fn new() -> Self { Self::with_nullable(true) } - pub fn with_nullable(nullable: bool) -> Self { + pub const fn with_nullable(nullable: bool) -> Self { Self { nullable } } @@ -859,6 +859,10 @@ impl RowType { self.fields.iter().position(|f| f.name == field_name) } + pub fn field_types(&self) -> impl Iterator + '_ { + self.fields.iter().map(|f| &f.data_type) + } + pub fn get_field_names(&self) -> Vec<&str> { self.fields.iter().map(|f| f.name.as_str()).collect() } @@ -931,7 +935,7 @@ impl DataTypes { DataType::Binary(BinaryType::new(length)) } - pub fn bytes() -> DataType { + pub const fn bytes() -> DataType { DataType::Bytes(BytesType::new()) } diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs index fdd4ad7322..6ead64276a 100644 --- a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs @@ -321,8 +321,10 @@ impl Iterator for KvRecordIterator { #[cfg(test)] mod tests { use super::*; - use crate::metadata::KvFormat; + use crate::metadata::{DataTypes, KvFormat}; use crate::record::kv::{CURRENT_KV_MAGIC_VALUE, KvRecordBatchBuilder}; + use 
crate::row::binary::BinaryWriter; + use crate::row::compacted::CompactedRow; use bytes::{BufMut, BytesMut}; #[test] @@ -363,12 +365,13 @@ mod tests { let key1 = b"key1"; let mut value1_writer = CompactedRowWriter::new(1); value1_writer.write_bytes(&[1, 2, 3, 4, 5]); - builder.append_row(key1, Some(&value1_writer)).unwrap(); + + let data_types = &[DataTypes::bytes()]; + let row = &CompactedRow::from_bytes(data_types, value1_writer.buffer()); + builder.append_row(key1, Some(row)).unwrap(); let key2 = b"key2"; - builder - .append_row::(key2, None) - .unwrap(); + builder.append_row::(key2, None).unwrap(); let bytes = builder.build().unwrap(); diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs index 773c7789b5..7d1a7972e4 100644 --- a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs @@ -317,13 +317,14 @@ impl Drop for KvRecordBatchBuilder { #[cfg(test)] mod tests { use super::*; - use crate::row::compacted::CompactedRowWriter; + use crate::metadata::{DataType, DataTypes}; + use crate::row::binary::BinaryWriter; + use crate::row::compacted::{CompactedRow, CompactedRowWriter}; // Helper function to create a CompactedRowWriter with a single bytes field for testing - fn create_test_row(data: &[u8]) -> CompactedRowWriter { - let mut writer = CompactedRowWriter::new(1); - writer.write_bytes(data); - writer + fn create_test_row(data: &[u8]) -> CompactedRow<'_> { + const DATA_TYPE: &[DataType] = &[DataTypes::bytes()]; + CompactedRow::from_bytes(DATA_TYPE, data) } #[test] @@ -349,10 +350,8 @@ mod tests { builder.append_row(key1, Some(&value1)).unwrap(); let key2 = b"key2"; - assert!(builder.has_room_for_row::(key2, None)); - builder - .append_row::(key2, None) - .unwrap(); + assert!(builder.has_room_for_row::(key2, None)); + builder.append_row::(key2, None).unwrap(); // Test close and build 
builder.close().unwrap(); @@ -373,11 +372,7 @@ mod tests { let value = create_test_row(b"value"); builder.append_row(b"key", Some(&value)).unwrap(); builder.abort(); - assert!( - builder - .append_row::(b"key2", None) - .is_err() - ); + assert!(builder.append_row::(b"key2", None).is_err()); assert!(builder.build().is_err()); assert!(builder.close().is_err()); @@ -386,11 +381,7 @@ mod tests { let value = create_test_row(b"value"); builder.append_row(b"key", Some(&value)).unwrap(); builder.close().unwrap(); - assert!( - builder - .append_row::(b"key2", None) - .is_err() - ); // Can't append after close + assert!(builder.append_row::(b"key2", None).is_err()); // Can't append after close assert!(builder.build().is_ok()); // But can still build } @@ -510,23 +501,26 @@ mod tests { row_writer1.write_int(42); row_writer1.write_string("hello"); + let data_types = &[DataTypes::int(), DataTypes::string()]; + let row1 = &CompactedRow::from_bytes(data_types, row_writer1.buffer()); + let key1 = b"key1"; - assert!(builder.has_room_for_row(key1, Some(&row_writer1))); - builder.append_row(key1, Some(&row_writer1)).unwrap(); + assert!(builder.has_room_for_row(key1, Some(row1))); + builder.append_row(key1, Some(row1)).unwrap(); // Create and append second record let mut row_writer2 = CompactedRowWriter::new(2); row_writer2.write_int(100); row_writer2.write_string("world"); + let row2 = &CompactedRow::from_bytes(data_types, row_writer2.buffer()); + let key2 = b"key2"; - builder.append_row(key2, Some(&row_writer2)).unwrap(); + builder.append_row(key2, Some(row2)).unwrap(); // Append a deletion record let key3 = b"key3"; - builder - .append_row::(key3, None) - .unwrap(); + builder.append_row::(key3, None).unwrap(); // Build and verify builder.close().unwrap(); @@ -567,15 +561,18 @@ mod tests { let mut row_writer = CompactedRowWriter::new(1); row_writer.write_int(42); + let data_types = &[DataTypes::int()]; + let row = &CompactedRow::from_bytes(data_types, row_writer.buffer()); + // 
INDEXED format should reject append_row let mut indexed_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::INDEXED); - let result = indexed_builder.append_row(b"key", Some(&row_writer)); + let result = indexed_builder.append_row(b"key", Some(row)); assert!(result.is_err()); assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidInput); // COMPACTED format should accept append_row let mut compacted_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); - let result = compacted_builder.append_row(b"key", Some(&row_writer)); + let result = compacted_builder.append_row(b"key", Some(row)); assert!(result.is_ok()); } } diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs index 481f9be502..9ff3b5ffd5 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs @@ -15,11 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use std::sync::OnceLock; - use crate::metadata::DataType; use crate::row::compacted::compacted_row_reader::{CompactedRowDeserializer, CompactedRowReader}; -use crate::row::{GenericRow, InternalRow}; +use crate::row::{BinaryRow, GenericRow, InternalRow}; +use std::sync::{Arc, OnceLock}; // Reference implementation: // https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/row/compacted/CompactedRow.java @@ -28,9 +27,9 @@ pub struct CompactedRow<'a> { arity: usize, size_in_bytes: usize, decoded_row: OnceLock>, - deserializer: CompactedRowDeserializer<'a>, + deserializer: Arc>, reader: CompactedRowReader<'a>, - data_types: &'a [DataType], + data: &'a [u8], } pub fn calculate_bit_set_width_in_bytes(arity: usize) -> usize { @@ -40,15 +39,25 @@ pub fn calculate_bit_set_width_in_bytes(arity: usize) -> usize { #[allow(dead_code)] impl<'a> CompactedRow<'a> { pub fn from_bytes(data_types: &'a [DataType], data: &'a [u8]) -> Self { - let arity = data_types.len(); - let size = data.len(); + Self::deserialize( + Arc::new(CompactedRowDeserializer::new(data_types)), + data_types.len(), + data, + ) + } + + pub fn deserialize( + deserializer: Arc>, + arity: usize, + data: &'a [u8], + ) -> Self { Self { arity, - size_in_bytes: size, + size_in_bytes: data.len(), decoded_row: OnceLock::new(), - deserializer: CompactedRowDeserializer::new(data_types), - reader: CompactedRowReader::new(arity, data, 0, size), - data_types, + deserializer: Arc::clone(&deserializer), + reader: CompactedRowReader::new(arity, data, 0, data.len()), + data, } } @@ -62,6 +71,12 @@ impl<'a> CompactedRow<'a> { } } +impl BinaryRow for CompactedRow<'_> { + fn as_bytes(&self) -> &[u8] { + self.data + } +} + #[allow(dead_code)] impl<'a> InternalRow for CompactedRow<'a> { fn get_field_count(&self) -> usize { @@ -69,7 +84,7 @@ impl<'a> InternalRow for CompactedRow<'a> { } fn is_null_at(&self, pos: usize) -> bool { - self.data_types[pos].is_nullable() && self.reader.is_null_at(pos) + 
self.deserializer.get_data_types()[pos].is_nullable() && self.reader.is_null_at(pos) } fn get_boolean(&self, pos: usize) -> bool { @@ -120,6 +135,8 @@ impl<'a> InternalRow for CompactedRow<'a> { #[cfg(test)] mod tests { use super::*; + use crate::row::binary::BinaryWriter; + use crate::metadata::{ BigIntType, BooleanType, BytesType, DoubleType, FloatType, IntType, SmallIntType, StringType, TinyIntType, diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs index 5ec260897e..9ce50952f4 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs @@ -21,17 +21,31 @@ use crate::{ row::{Datum, GenericRow, compacted::compacted_row_writer::CompactedRowWriter}, util::varint::{read_unsigned_varint_at, read_unsigned_varint_u64_at}, }; +use std::borrow::Cow; use std::str::from_utf8; #[allow(dead_code)] +#[derive(Clone)] pub struct CompactedRowDeserializer<'a> { - schema: &'a [DataType], + schema: Cow<'a, [DataType]>, } #[allow(dead_code)] impl<'a> CompactedRowDeserializer<'a> { pub fn new(schema: &'a [DataType]) -> Self { - Self { schema } + Self { + schema: Cow::Borrowed(schema), + } + } + + pub fn new_from_owned(schema: Vec) -> Self { + Self { + schema: Cow::Owned(schema), + } + } + + pub fn get_data_types(&self) -> &[DataType] { + self.schema.as_ref() } pub fn deserialize(&self, reader: &CompactedRowReader<'a>) -> GenericRow<'a> { diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs index 63b32a3dca..c130e94cce 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs @@ -15,12 +15,11 @@ // specific language governing permissions and limitations // under the License. 
-use bytes::{Bytes, BytesMut}; -use std::cmp; - -use crate::row::BinaryRow; +use crate::row::binary::BinaryWriter; use crate::row::compacted::compacted_row::calculate_bit_set_width_in_bytes; use crate::util::varint::{write_unsigned_varint_to_slice, write_unsigned_varint_u64_to_slice}; +use bytes::{Bytes, BytesMut}; +use std::cmp; // Writer for CompactedRow // Reference implementation: @@ -51,11 +50,6 @@ impl CompactedRowWriter { } } - pub fn reset(&mut self) { - self.position = self.header_size_in_bytes; - self.buffer[..self.header_size_in_bytes].fill(0); - } - pub fn position(&self) -> usize { self.position } @@ -81,75 +75,78 @@ impl CompactedRowWriter { self.buffer[self.position..end].copy_from_slice(src); self.position = end; } +} +impl BinaryWriter for CompactedRowWriter { + fn reset(&mut self) { + self.position = self.header_size_in_bytes; + self.buffer[..self.header_size_in_bytes].fill(0); + } - pub fn set_null_at(&mut self, pos: usize) { + fn set_null_at(&mut self, pos: usize) { let byte_index = pos >> 3; let bit = pos & 7; debug_assert!(byte_index < self.header_size_in_bytes); self.buffer[byte_index] |= 1u8 << bit; } - pub fn write_boolean(&mut self, value: bool) { + fn write_boolean(&mut self, value: bool) { let b = if value { 1u8 } else { 0u8 }; self.write_raw(&[b]); } - pub fn write_byte(&mut self, value: u8) { + fn write_byte(&mut self, value: u8) { self.write_raw(&[value]); } - pub fn write_binary(&mut self, bytes: &[u8], length: usize) { - // TODO: currently, we encoding BINARY(length) as the same with BYTES, the length info can - // be omitted and the bytes length should be enforced in the future. 
- self.write_bytes(&bytes[..length.min(bytes.len())]); - } - - pub fn write_bytes(&mut self, value: &[u8]) { + fn write_bytes(&mut self, value: &[u8]) { let len_i32 = i32::try_from(value.len()).expect("byte slice too large to encode length as i32"); self.write_int(len_i32); self.write_raw(value); } - pub fn write_char(&mut self, value: &str, _length: usize) { + fn write_char(&mut self, value: &str, _length: usize) { // TODO: currently, we encoding CHAR(length) as the same with STRING, the length info can be // omitted and the bytes length should be enforced in the future. self.write_string(value); } - pub fn write_string(&mut self, value: &str) { + fn write_string(&mut self, value: &str) { self.write_bytes(value.as_ref()); } - pub fn write_short(&mut self, value: i16) { + fn write_short(&mut self, value: i16) { self.write_raw(&value.to_ne_bytes()); } - pub fn write_int(&mut self, value: i32) { + fn write_int(&mut self, value: i32) { self.ensure_capacity(Self::MAX_INT_SIZE); let bytes_written = write_unsigned_varint_to_slice(value as u32, &mut self.buffer[self.position..]); self.position += bytes_written; } - pub fn write_long(&mut self, value: i64) { + fn write_long(&mut self, value: i64) { self.ensure_capacity(Self::MAX_LONG_SIZE); let bytes_written = write_unsigned_varint_u64_to_slice(value as u64, &mut self.buffer[self.position..]); self.position += bytes_written; } - - pub fn write_float(&mut self, value: f32) { + fn write_float(&mut self, value: f32) { self.write_raw(&value.to_ne_bytes()); } - pub fn write_double(&mut self, value: f64) { + fn write_double(&mut self, value: f64) { self.write_raw(&value.to_ne_bytes()); } -} -impl BinaryRow for CompactedRowWriter { - fn as_bytes(&self) -> &[u8] { - self.buffer() + fn write_binary(&mut self, bytes: &[u8], length: usize) { + // TODO: currently, we encoding BINARY(length) as the same with BYTES, the length info can + // be omitted and the bytes length should be enforced in the future. 
+ self.write_bytes(&bytes[..length.min(bytes.len())]); + } + + fn complete(&mut self) { + // do nothing } } diff --git a/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs b/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs new file mode 100644 index 0000000000..fc39bb7a26 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::DataType; +use crate::row::Datum; +use crate::row::binary::{BinaryRowFormat, BinaryWriter, ValueWriter}; +use crate::row::compacted::{CompactedRow, CompactedRowDeserializer, CompactedRowWriter}; +use crate::row::encode::{BinaryRow, RowEncoder}; +use std::sync::Arc; + +#[allow(dead_code)] +pub struct CompactedRowEncoder<'a> { + arity: usize, + writer: CompactedRowWriter, + field_writers: Vec, + compacted_row_deserializer: Arc>, +} + +impl<'a> CompactedRowEncoder<'a> { + pub fn new(field_data_types: Vec) -> Result { + let field_writers = field_data_types + .iter() + .map(|d| ValueWriter::create_value_writer(d, Some(&BinaryRowFormat::Compacted))) + .collect::>>()?; + + Ok(Self { + arity: field_data_types.len(), + writer: CompactedRowWriter::new(field_data_types.len()), + field_writers, + compacted_row_deserializer: Arc::new(CompactedRowDeserializer::new_from_owned( + field_data_types, + )), + }) + } +} + +impl RowEncoder for CompactedRowEncoder<'_> { + fn start_new_row(&mut self) -> Result<()> { + self.writer.reset(); + Ok(()) + } + + fn encode_field(&mut self, pos: usize, value: Datum) -> Result<()> { + self.field_writers + .get(pos) + .ok_or_else(|| IllegalArgument { + message: format!( + "invalid position {} when attempting to encode value {}", + pos, value + ), + })? + .write_value(&mut self.writer, pos, &value) + } + + fn finish_row(&mut self) -> Result { + Ok(CompactedRow::deserialize( + Arc::clone(&self.compacted_row_deserializer), + self.arity, + self.writer.buffer(), + )) + } + + fn close(&mut self) -> Result<()> { + // do nothing + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/row/encode/mod.rs b/fluss-rust/crates/fluss/src/row/encode/mod.rs index 6c6eed9936..34863aba19 100644 --- a/fluss-rust/crates/fluss/src/row/encode/mod.rs +++ b/fluss-rust/crates/fluss/src/row/encode/mod.rs @@ -16,11 +16,13 @@ // under the License. 
mod compacted_key_encoder; +mod compacted_row_encoder; use crate::error::Result; -use crate::metadata::{DataLakeFormat, RowType}; -use crate::row::InternalRow; +use crate::metadata::{DataLakeFormat, DataType, KvFormat, RowType}; use crate::row::encode::compacted_key_encoder::CompactedKeyEncoder; +use crate::row::encode::compacted_row_encoder::CompactedRowEncoder; +use crate::row::{BinaryRow, Datum, InternalRow}; use bytes::Bytes; /// An interface for encoding key of row into bytes. @@ -62,3 +64,65 @@ impl dyn KeyEncoder { } } } + +/// An encoder to write [`BinaryRow`]. It's used to write row +/// multi-times one by one. When writing a new row: +/// +/// 1. call method [`RowEncoder::start_new_row()`] to start the writing. +/// 2. call method [`RowEncoder::encode_field()`] to write the row's field. +/// 3. call method [`RowEncoder::finishRow()`] to finish the writing and get the written row. +#[allow(dead_code)] +pub trait RowEncoder { + /// Start to write a new row. + /// + /// # Returns + /// * Ok(()) if successful + fn start_new_row(&mut self) -> Result<()>; + + /// Write the row's field in given pos with given value. + /// + /// # Arguments + /// * pos - the position of the field to write. + /// * value - the value of the field to write. + /// + /// # Returns + /// * Ok(()) if successful + fn encode_field(&mut self, pos: usize, value: Datum) -> Result<()>; + + /// Finish write the row, returns the written row. + /// + /// Note that returned row borrows from [`RowEncoder`]'s internal buffer which is reused for subsequent rows + /// [`RowEncoder::start_new_row()`] should only be called after the returned row goes out of scope. 
+ /// + /// # Returns + /// * the written row + fn finish_row(&mut self) -> Result; + + /// Closes the row encoder + /// + /// # Returns + /// * Ok(()) if successful + fn close(&mut self) -> Result<()>; +} + +#[allow(dead_code)] +pub struct RowEncoderFactory {} + +#[allow(dead_code)] +impl RowEncoderFactory { + pub fn create(kv_format: KvFormat, row_type: &RowType) -> Result { + Self::create_for_field_types(kv_format, row_type.field_types().cloned().collect()) + } + + pub fn create_for_field_types( + kv_format: KvFormat, + field_data_types: Vec, + ) -> Result { + match kv_format { + KvFormat::INDEXED => { + todo!() + } + KvFormat::COMPACTED => CompactedRowEncoder::new(field_data_types), + } + } +} diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index 144d64fd88..499606354c 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -19,7 +19,7 @@ mod column; mod datum; -mod binary; +pub mod binary; pub mod compacted; mod encode; mod field_getter; @@ -27,7 +27,7 @@ mod field_getter; pub use column::*; pub use datum::*; -pub trait BinaryRow { +pub trait BinaryRow: InternalRow { /// Returns the binary representation of this row as a byte slice. 
fn as_bytes(&self) -> &[u8]; } From c4d5b0214faf7797968b9e478482b0869b6bdb5a Mon Sep 17 00:00:00 2001 From: Anton Borisov <148864665+fresh-borzoni@users.noreply.github.com> Date: Fri, 16 Jan 2026 02:27:04 +0000 Subject: [PATCH 067/287] feat: add column projection support to Python LogScanner (#151) --- fluss-rust/bindings/python/example/example.py | 22 ++++ fluss-rust/bindings/python/src/table.rs | 112 ++++++++++++++---- 2 files changed, 109 insertions(+), 25 deletions(-) diff --git a/fluss-rust/bindings/python/example/example.py b/fluss-rust/bindings/python/example/example.py index 0523f943e4..0b1e67d3e6 100644 --- a/fluss-rust/bindings/python/example/example.py +++ b/fluss-rust/bindings/python/example/example.py @@ -178,6 +178,28 @@ async def main(): except Exception as e: print(f"Error during scanning: {e}") + # Demo: Column projection + print("\n--- Testing Column Projection ---") + try: + # Project specific columns by index + print("\n1. Projection by index [0, 1] (id, name):") + scanner_index = await table.new_log_scanner(project=[0, 1]) + scanner_index.subscribe(None, None) + df_projected = scanner_index.to_pandas() + print(df_projected.head()) + print(f" Projected {df_projected.shape[1]} columns: {list(df_projected.columns)}") + + # Project specific columns by name (Pythonic!) + print("\n2. 
Projection by name ['name', 'score'] (Pythonic):") + scanner_names = await table.new_log_scanner(columns=["name", "score"]) + scanner_names.subscribe(None, None) + df_named = scanner_names.to_pandas() + print(df_named.head()) + print(f" Projected {df_named.shape[1]} columns: {list(df_named.columns)}") + + except Exception as e: + print(f"Error during projection: {e}") + # Close connection conn.close() print("\nConnection closed") diff --git a/fluss-rust/bindings/python/src/table.rs b/fluss-rust/bindings/python/src/table.rs index 8a1164856b..6cd13c4fa1 100644 --- a/fluss-rust/bindings/python/src/table.rs +++ b/fluss-rust/bindings/python/src/table.rs @@ -34,6 +34,12 @@ pub struct FlussTable { has_primary_key: bool, } +/// Internal enum to represent different projection types +enum ProjectionType { + Indices(Vec), + Names(Vec), +} + #[pymethods] impl FlussTable { /// Create a new append writer for the table @@ -57,32 +63,39 @@ impl FlussTable { }) } - /// Create a new log scanner for the table - fn new_log_scanner<'py>(&self, py: Python<'py>) -> PyResult> { - let conn = self.connection.clone(); - let metadata = self.metadata.clone(); - let table_info = self.table_info.clone(); - - future_into_py(py, async move { - let fluss_table = - fcore::client::FlussTable::new(&conn, metadata.clone(), table_info.clone()); - - let table_scan = fluss_table.new_scan(); - - let rust_scanner = table_scan.create_log_scanner().map_err(|e| { - PyErr::new::(format!( - "Failed to create log scanner: {e:?}" - )) - })?; - - let admin = conn - .get_admin() - .await - .map_err(|e| FlussError::new_err(e.to_string()))?; + /// Create a new log scanner for the table. + /// + /// Args: + /// project: Optional list of column indices (0-based) to include in the scan. + /// columns: Optional list of column names to include in the scan. + /// + /// Returns: + /// LogScanner, optionally with projection applied + /// + /// Note: + /// Specify only one of 'project' or 'columns'. 
+ /// If neither is specified, all columns are included. + /// Rust side will validate the projection parameters. + /// + #[pyo3(signature = (project=None, columns=None))] + pub fn new_log_scanner<'py>( + &self, + py: Python<'py>, + project: Option>, + columns: Option>, + ) -> PyResult> { + let projection = match (project, columns) { + (Some(_), Some(_)) => { + return Err(FlussError::new_err( + "Specify only one of 'project' or 'columns'".to_string(), + )); + } + (Some(indices), None) => Some(ProjectionType::Indices(indices)), + (None, Some(names)) => Some(ProjectionType::Names(names)), + (None, None) => None, + }; - let py_scanner = LogScanner::from_core(rust_scanner, admin, table_info.clone()); - Python::attach(|py| Py::new(py, py_scanner)) - }) + self.create_log_scanner_internal(py, projection) } /// Get table information @@ -126,6 +139,55 @@ impl FlussTable { has_primary_key, } } + + /// Internal helper to create log scanner with optional projection + fn create_log_scanner_internal<'py>( + &self, + py: Python<'py>, + projection: Option, + ) -> PyResult> { + let conn = self.connection.clone(); + let metadata = self.metadata.clone(); + let table_info = self.table_info.clone(); + + future_into_py(py, async move { + let fluss_table = + fcore::client::FlussTable::new(&conn, metadata.clone(), table_info.clone()); + + let mut table_scan = fluss_table.new_scan(); + + // Apply projection if specified + if let Some(proj) = projection { + table_scan = match proj { + ProjectionType::Indices(indices) => { + table_scan.project(&indices).map_err(|e| { + FlussError::new_err(format!("Failed to project columns: {e}")) + })? + } + ProjectionType::Names(names) => { + // Convert Vec to Vec<&str> for the API + let column_name_refs: Vec<&str> = + names.iter().map(|s| s.as_str()).collect(); + table_scan.project_by_name(&column_name_refs).map_err(|e| { + FlussError::new_err(format!("Failed to project columns: {e}")) + })? 
+ } + }; + } + + let rust_scanner = table_scan + .create_log_scanner() + .map_err(|e| FlussError::new_err(format!("Failed to create log scanner: {e}")))?; + + let admin = conn + .get_admin() + .await + .map_err(|e| FlussError::new_err(e.to_string()))?; + + let py_scanner = LogScanner::from_core(rust_scanner, admin, table_info.clone()); + Python::attach(|py| Py::new(py, py_scanner)) + }) + } } /// Writer for appending data to a Fluss table From dce62d6d4a0fa9d4a38ee1aec23a57b8dfe35b18 Mon Sep 17 00:00:00 2001 From: AlexZhao Date: Fri, 16 Jan 2026 14:22:48 +0800 Subject: [PATCH 068/287] chore: fix read deadlock (#165) --- .../src/client/table/log_fetch_buffer.rs | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs index e9bac53f1a..c55c994b03 100644 --- a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs +++ b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs @@ -211,19 +211,27 @@ impl LogFetchBuffer { pub fn buffered_buckets(&self) -> Vec { let mut buckets = Vec::new(); - let next_in_line_fetch = self.next_in_line_fetch.lock(); - if let Some(complete_fetch) = next_in_line_fetch.as_ref() { - if !complete_fetch.is_consumed() { - buckets.push(complete_fetch.table_bucket().clone()); + // Avoid holding multiple locks at once to prevent lock-order inversion. 
+ { + let next_in_line_fetch = self.next_in_line_fetch.lock(); + if let Some(complete_fetch) = next_in_line_fetch.as_ref() { + if !complete_fetch.is_consumed() { + buckets.push(complete_fetch.table_bucket().clone()); + } } } - let completed = self.completed_fetches.lock(); - for fetch in completed.iter() { - buckets.push(fetch.table_bucket().clone()); + { + let completed = self.completed_fetches.lock(); + for fetch in completed.iter() { + buckets.push(fetch.table_bucket().clone()); + } + } + + { + let pending = self.pending_fetches.lock(); + buckets.extend(pending.keys().cloned()); } - let pending = self.pending_fetches.lock(); - buckets.extend(pending.keys().cloned()); buckets } } From 93048ed0d783987da266904ee216d5506e0b2204 Mon Sep 17 00:00:00 2001 From: Andrea Bozzo Date: Sat, 17 Jan 2026 02:45:00 +0100 Subject: [PATCH 069/287] feat: introduce lookup support for primary key tables (#159) --- .../crates/fluss/src/client/table/lookup.rs | 252 ++++++++++++++++++ .../crates/fluss/src/client/table/mod.rs | 38 ++- .../fluss/src/client/write/write_format.rs | 4 +- fluss-rust/crates/fluss/src/metadata/table.rs | 1 + .../crates/fluss/src/proto/fluss_api.proto | 28 ++ .../crates/fluss/src/record/kv/kv_record.rs | 12 +- .../fluss/src/record/kv/kv_record_batch.rs | 15 +- .../src/record/kv/kv_record_batch_builder.rs | 30 +-- .../fluss/src/row/compacted/compacted_row.rs | 58 ++-- .../src/row/compacted/compacted_row_reader.rs | 18 +- .../src/row/encode/compacted_row_encoder.rs | 19 +- fluss-rust/crates/fluss/src/row/encode/mod.rs | 8 +- fluss-rust/crates/fluss/src/row/mod.rs | 4 +- fluss-rust/crates/fluss/src/rpc/api_key.rs | 4 + .../crates/fluss/src/rpc/message/lookup.rs | 67 +++++ .../crates/fluss/src/rpc/message/mod.rs | 2 + fluss-rust/crates/fluss/src/util/varint.rs | 14 +- 17 files changed, 478 insertions(+), 96 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/client/table/lookup.rs create mode 100644 fluss-rust/crates/fluss/src/rpc/message/lookup.rs diff 
--git a/fluss-rust/crates/fluss/src/client/table/lookup.rs b/fluss-rust/crates/fluss/src/client/table/lookup.rs new file mode 100644 index 0000000000..1d32ebd75e --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/lookup.rs @@ -0,0 +1,252 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::bucketing::BucketingFunction; +use crate::client::connection::FlussConnection; +use crate::client::metadata::Metadata; +use crate::error::{Error, Result}; +use crate::metadata::{RowType, TableBucket, TableInfo}; +use crate::row::InternalRow; +use crate::row::compacted::CompactedRow; +use crate::row::encode::KeyEncoder; +use crate::rpc::ApiError; +use crate::rpc::message::LookupRequest; +use std::sync::Arc; + +/// The result of a lookup operation. +/// +/// Contains the rows returned from a lookup. For primary key lookups, +/// this will contain at most one row. For prefix key lookups (future), +/// this may contain multiple rows. +pub struct LookupResult<'a> { + rows: Vec>, + row_type: &'a RowType, +} + +impl<'a> LookupResult<'a> { + /// Creates a new LookupResult from a list of row bytes. + fn new(rows: Vec>, row_type: &'a RowType) -> Self { + Self { rows, row_type } + } + + /// Creates an empty LookupResult. 
+ fn empty(row_type: &'a RowType) -> Self { + Self { + rows: Vec::new(), + row_type, + } + } + + /// Returns the only row in the result set as a [`CompactedRow`]. + /// + /// This method provides a zero-copy view of the row data, which means the returned + /// `CompactedRow` borrows from this result set and cannot outlive it. + /// + /// # Returns + /// - `Ok(Some(row))`: If exactly one row exists. + /// - `Ok(None)`: If the result set is empty. + /// - `Err(Error::UnexpectedError)`: If the result set contains more than one row. + /// + pub fn get_single_row(&self) -> Result>> { + match self.rows.len() { + 0 => Ok(None), + 1 => Ok(Some(CompactedRow::from_bytes(self.row_type, &self.rows[0]))), + _ => Err(Error::UnexpectedError { + message: "LookupResult contains multiple rows, use get_rows() instead".to_string(), + source: None, + }), + } + } + + /// Returns all rows as CompactedRows. + pub fn get_rows(&self) -> Vec> { + self.rows + .iter() + .map(|bytes| CompactedRow::from_bytes(self.row_type, bytes)) + .collect() + } +} + +/// Configuration and factory struct for creating lookup operations. +/// +/// `TableLookup` follows the same pattern as `TableScan` and `TableAppend`, +/// providing a builder-style API for configuring lookup operations before +/// creating the actual `Lookuper`. 
+/// +/// # Example +/// ```ignore +/// let table = conn.get_table(&table_path).await?; +/// let lookuper = table.new_lookup()?.create_lookuper()?; +/// let result = lookuper.lookup(&row).await?; +/// if let Some(value) = result.get_single_row() { +/// println!("Found: {:?}", value); +/// } +/// ``` +// TODO: Add lookup_by(column_names) for prefix key lookups (PrefixKeyLookuper) +// TODO: Add create_typed_lookuper() for typed lookups with POJO mapping +pub struct TableLookup<'a> { + conn: &'a FlussConnection, + table_info: TableInfo, + metadata: Arc, +} + +impl<'a> TableLookup<'a> { + pub(super) fn new( + conn: &'a FlussConnection, + table_info: TableInfo, + metadata: Arc, + ) -> Self { + Self { + conn, + table_info, + metadata, + } + } + + /// Creates a `Lookuper` for performing key-based lookups. + /// + /// The lookuper will automatically encode the key and compute the bucket + /// for each lookup using the appropriate bucketing function. + pub fn create_lookuper(self) -> Result> { + let num_buckets = self.table_info.get_num_buckets(); + + // Get data lake format from table config for bucketing function + let data_lake_format = self.table_info.get_table_config().get_datalake_format()?; + let bucketing_function = ::of(data_lake_format.as_ref()); + + // Create key encoder for the primary key fields + let pk_fields = self.table_info.get_physical_primary_keys().to_vec(); + let key_encoder = + ::of(self.table_info.row_type(), pk_fields, data_lake_format)?; + + Ok(Lookuper { + conn: self.conn, + table_info: self.table_info, + metadata: self.metadata, + bucketing_function, + key_encoder, + num_buckets, + }) + } +} + +/// Performs key-based lookups against a primary key table. +/// +/// The `Lookuper` automatically encodes the lookup key, computes the target +/// bucket, finds the appropriate tablet server, and retrieves the value. 
+/// +/// # Example +/// ```ignore +/// let lookuper = table.new_lookup()?.create_lookuper()?; +/// let row = GenericRow::new(vec![Datum::Int32(42)]); // lookup key +/// let result = lookuper.lookup(&row).await?; +/// ``` +// TODO: Support partitioned tables (extract partition from key) +pub struct Lookuper<'a> { + conn: &'a FlussConnection, + table_info: TableInfo, + metadata: Arc, + bucketing_function: Box, + key_encoder: Box, + num_buckets: i32, +} + +impl<'a> Lookuper<'a> { + /// Looks up a value by its primary key. + /// + /// The key is encoded and the bucket is automatically computed using + /// the table's bucketing function. + /// + /// # Arguments + /// * `row` - The row containing the primary key field values + /// + /// # Returns + /// * `Ok(LookupResult)` - The lookup result (may be empty if key not found) + /// * `Err(Error)` - If the lookup fails + pub async fn lookup(&mut self, row: &dyn InternalRow) -> Result> { + // todo: support batch lookup + // Encode the key from the row + let encoded_key = self.key_encoder.encode_key(row)?; + let key_bytes = encoded_key.to_vec(); + + // Compute bucket from encoded key + let bucket_id = self + .bucketing_function + .bucketing(&key_bytes, self.num_buckets)?; + + let table_id = self.table_info.get_table_id(); + let table_bucket = TableBucket::new(table_id, bucket_id); + + // Find the leader for this bucket + let cluster = self.metadata.get_cluster(); + let leader = + cluster + .leader_for(&table_bucket) + .ok_or_else(|| Error::LeaderNotAvailable { + message: format!("No leader found for table bucket: {table_bucket}"), + })?; + + // Get connection to the tablet server + let tablet_server = + cluster + .get_tablet_server(leader.id()) + .ok_or_else(|| Error::LeaderNotAvailable { + message: format!( + "Tablet server {} is not found in metadata cache", + leader.id() + ), + })?; + + let connections = self.conn.get_connections(); + let connection = connections.get_connection(tablet_server).await?; + + // Send lookup 
request + let request = LookupRequest::new(table_id, None, bucket_id, vec![key_bytes]); + let response = connection.request(request).await?; + + // Extract the values from response + if let Some(bucket_resp) = response.buckets_resp.into_iter().next() { + // Check for errors + if let Some(error_code) = bucket_resp.error_code { + if error_code != 0 { + return Err(Error::FlussAPIError { + api_error: ApiError { + code: error_code, + message: bucket_resp.error_message.unwrap_or_default(), + }, + }); + } + } + + // Collect all values + let rows: Vec> = bucket_resp + .values + .into_iter() + .filter_map(|pb_value| pb_value.values) + .collect(); + + return Ok(LookupResult::new(rows, self.table_info.row_type())); + } + + Ok(LookupResult::empty(self.table_info.row_type())) + } + + /// Returns a reference to the table info. + pub fn table_info(&self) -> &TableInfo { + &self.table_info + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/mod.rs b/fluss-rust/crates/fluss/src/client/table/mod.rs index 26341d70a6..7356be2393 100644 --- a/fluss-rust/crates/fluss/src/client/table/mod.rs +++ b/fluss-rust/crates/fluss/src/client/table/mod.rs @@ -17,14 +17,14 @@ use crate::client::connection::FlussConnection; use crate::client::metadata::Metadata; +use crate::error::{Error, Result}; use crate::metadata::{TableInfo, TablePath}; use std::sync::Arc; -use crate::error::Result; - pub const EARLIEST_OFFSET: i64 = -2; mod append; +mod lookup; mod log_fetch_buffer; mod remote_log; @@ -32,6 +32,7 @@ mod scanner; mod writer; pub use append::{AppendWriter, TableAppend}; +pub use lookup::{LookupResult, Lookuper, TableLookup}; pub use scanner::{LogScanner, RecordBatchLogScanner, TableScan}; #[allow(dead_code)] @@ -85,6 +86,39 @@ impl<'a> FlussTable<'a> { pub fn has_primary_key(&self) -> bool { self.has_primary_key } + + /// Creates a new `TableLookup` for configuring lookup operations. 
+ /// + /// This follows the same pattern as `new_scan()` and `new_append()`, + /// returning a configuration object that can be used to create a `Lookuper`. + /// + /// The table must have a primary key (be a primary key table). + /// + /// # Returns + /// * `Ok(TableLookup)` - A lookup configuration object + /// * `Err(Error)` - If the table doesn't have a primary key + /// + /// # Example + /// ```ignore + /// let table = conn.get_table(&table_path).await?; + /// let lookuper = table.new_lookup()?.create_lookuper()?; + /// let key = vec![1, 2, 3]; // encoded primary key bytes + /// if let Some(value) = lookuper.lookup(key).await? { + /// println!("Found value: {:?}", value); + /// } + /// ``` + pub fn new_lookup(&self) -> Result> { + if !self.has_primary_key { + return Err(Error::UnsupportedOperation { + message: "Lookup is only supported for primary key tables".to_string(), + }); + } + Ok(TableLookup::new( + self.conn, + self.table_info.clone(), + self.metadata.clone(), + )) + } } impl<'a> Drop for FlussTable<'a> { diff --git a/fluss-rust/crates/fluss/src/client/write/write_format.rs b/fluss-rust/crates/fluss/src/client/write/write_format.rs index d65e42de68..4a0c0d8afa 100644 --- a/fluss-rust/crates/fluss/src/client/write/write_format.rs +++ b/fluss-rust/crates/fluss/src/client/write/write_format.rs @@ -39,7 +39,7 @@ impl WriteFormat { match self { WriteFormat::CompactedKv => Ok(KvFormat::COMPACTED), other => Err(IllegalArgument { - message: format!("WriteFormat `{}` is not a KvFormat", other), + message: format!("WriteFormat `{other}` is not a KvFormat"), }), } } @@ -48,7 +48,7 @@ impl WriteFormat { match kv_format { KvFormat::COMPACTED => Ok(WriteFormat::CompactedKv), other => Err(IllegalArgument { - message: format!("Unknown KvFormat: `{}`", other), + message: format!("Unknown KvFormat: `{other}`"), }), } } diff --git a/fluss-rust/crates/fluss/src/metadata/table.rs b/fluss-rust/crates/fluss/src/metadata/table.rs index b1e8a90ba2..da85b0c2da 100644 --- 
a/fluss-rust/crates/fluss/src/metadata/table.rs +++ b/fluss-rust/crates/fluss/src/metadata/table.rs @@ -729,6 +729,7 @@ impl TableConfig { ArrowCompressionInfo::from_conf(&self.properties) } + /// Returns the data lake format if configured, or None if not set. pub fn get_datalake_format(&self) -> Result> { self.properties .get("table.datalake.format") diff --git a/fluss-rust/crates/fluss/src/proto/fluss_api.proto b/fluss-rust/crates/fluss/src/proto/fluss_api.proto index dbbb45daea..b4ae8405aa 100644 --- a/fluss-rust/crates/fluss/src/proto/fluss_api.proto +++ b/fluss-rust/crates/fluss/src/proto/fluss_api.proto @@ -317,4 +317,32 @@ message GetFileSystemSecurityTokenResponse { required bytes token = 2; optional int64 expiration_time = 3; repeated PbKeyValue addition_info = 4; +} + +// lookup request and response +message LookupRequest { + required int64 table_id = 1; + repeated PbLookupReqForBucket buckets_req = 2; +} + +message LookupResponse { + repeated PbLookupRespForBucket buckets_resp = 1; +} + +message PbLookupReqForBucket { + optional int64 partition_id = 1; + required int32 bucket_id = 2; + repeated bytes key = 3; +} + +message PbLookupRespForBucket { + optional int64 partition_id = 1; + required int32 bucket_id = 2; + optional int32 error_code = 3; + optional string error_message = 4; + repeated PbValue values = 5; +} + +message PbValue { + optional bytes values = 1; } \ No newline at end of file diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record.rs index 8c30713d42..ab8c2ac1dd 100644 --- a/fluss-rust/crates/fluss/src/record/kv/kv_record.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record.rs @@ -101,7 +101,7 @@ impl KvRecord { let size_i32 = i32::try_from(size_in_bytes).map_err(|_| { io::Error::new( io::ErrorKind::InvalidInput, - format!("Record size {} exceeds i32::MAX", size_in_bytes), + format!("Record size {size_in_bytes} exceeds i32::MAX"), ) })?; buf.put_i32_le(size_i32); @@ -141,7 
+141,7 @@ impl KvRecord { if size_in_bytes_i32 < 0 { return Err(io::Error::new( io::ErrorKind::InvalidData, - format!("Invalid record length: {}", size_in_bytes_i32), + format!("Invalid record length: {size_in_bytes_i32}"), )); } @@ -150,10 +150,7 @@ impl KvRecord { let total_size = size_in_bytes.checked_add(LENGTH_LENGTH).ok_or_else(|| { io::Error::new( io::ErrorKind::InvalidData, - format!( - "Record size overflow: {} + {}", - size_in_bytes, LENGTH_LENGTH - ), + format!("Record size overflow: {size_in_bytes} + {LENGTH_LENGTH}"), ) })?; @@ -162,8 +159,7 @@ impl KvRecord { return Err(io::Error::new( io::ErrorKind::UnexpectedEof, format!( - "Not enough bytes to read record: expected {}, available {}", - total_size, available + "Not enough bytes to read record: expected {total_size}, available {available}" ), )); } diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs index 6ead64276a..eb3c09ad34 100644 --- a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs @@ -96,7 +96,7 @@ impl KvRecordBatch { if length_i32 < 0 { return Err(io::Error::new( io::ErrorKind::InvalidData, - format!("Invalid batch length: {}", length_i32), + format!("Invalid batch length: {length_i32}"), )); } @@ -150,10 +150,7 @@ impl KvRecordBatch { if size < RECORD_BATCH_HEADER_SIZE { return Err(io::Error::new( io::ErrorKind::InvalidData, - format!( - "Batch size {} is less than header size {}", - size, RECORD_BATCH_HEADER_SIZE - ), + format!("Batch size {size} is less than header size {RECORD_BATCH_HEADER_SIZE}"), )); } @@ -276,7 +273,7 @@ impl KvRecordBatch { if count < 0 { return Err(io::Error::new( io::ErrorKind::InvalidData, - format!("Invalid record count: {}", count), + format!("Invalid record count: {count}"), )); } Ok(KvRecordIterator { @@ -321,7 +318,7 @@ impl Iterator for KvRecordIterator { #[cfg(test)] mod tests { use super::*; - use 
crate::metadata::{DataTypes, KvFormat}; + use crate::metadata::{DataTypes, KvFormat, RowType}; use crate::record::kv::{CURRENT_KV_MAGIC_VALUE, KvRecordBatchBuilder}; use crate::row::binary::BinaryWriter; use crate::row::compacted::CompactedRow; @@ -366,8 +363,8 @@ mod tests { let mut value1_writer = CompactedRowWriter::new(1); value1_writer.write_bytes(&[1, 2, 3, 4, 5]); - let data_types = &[DataTypes::bytes()]; - let row = &CompactedRow::from_bytes(data_types, value1_writer.buffer()); + let row_type = RowType::with_data_types([DataTypes::bytes()].to_vec()); + let row = &CompactedRow::from_bytes(&row_type, value1_writer.buffer()); builder.append_row(key1, Some(row)).unwrap(); let key2 = b"key2"; diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs index 7d1a7972e4..c36a86121b 100644 --- a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs @@ -248,7 +248,7 @@ impl KvRecordBatchBuilder { let total_size = i32::try_from(size_without_length).map_err(|_| { io::Error::new( io::ErrorKind::InvalidInput, - format!("Batch size {} exceeds i32::MAX", size_without_length), + format!("Batch size {size_without_length} exceeds i32::MAX"), ) })?; @@ -317,14 +317,16 @@ impl Drop for KvRecordBatchBuilder { #[cfg(test)] mod tests { use super::*; - use crate::metadata::{DataType, DataTypes}; + use crate::metadata::{DataTypes, RowType}; use crate::row::binary::BinaryWriter; use crate::row::compacted::{CompactedRow, CompactedRowWriter}; + use std::sync::LazyLock; + static TEST_ROW_TYPE: LazyLock = + LazyLock::new(|| RowType::with_data_types(vec![DataTypes::bytes()])); // Helper function to create a CompactedRowWriter with a single bytes field for testing fn create_test_row(data: &[u8]) -> CompactedRow<'_> { - const DATA_TYPE: &[DataType] = &[DataTypes::bytes()]; - CompactedRow::from_bytes(DATA_TYPE, data) + 
CompactedRow::from_bytes(&TEST_ROW_TYPE, data) } #[test] @@ -483,7 +485,6 @@ mod tests { #[test] fn test_builder_with_compacted_row_writer() { - use crate::metadata::{DataType, IntType, StringType}; use crate::record::kv::KvRecordBatch; use crate::row::InternalRow; use crate::row::compacted::CompactedRow; @@ -491,18 +492,13 @@ mod tests { let mut builder = KvRecordBatchBuilder::new(1, 100000, KvFormat::COMPACTED); builder.set_writer_state(100, 5); - let types = vec![ - DataType::Int(IntType::new()), - DataType::String(StringType::new()), - ]; - // Create and append first record with CompactedRowWriter let mut row_writer1 = CompactedRowWriter::new(2); row_writer1.write_int(42); row_writer1.write_string("hello"); - let data_types = &[DataTypes::int(), DataTypes::string()]; - let row1 = &CompactedRow::from_bytes(data_types, row_writer1.buffer()); + let row_type = RowType::with_data_types([DataTypes::int(), DataTypes::string()].to_vec()); + let row1 = &CompactedRow::from_bytes(&row_type, row_writer1.buffer()); let key1 = b"key1"; assert!(builder.has_room_for_row(key1, Some(row1))); @@ -513,7 +509,7 @@ mod tests { row_writer2.write_int(100); row_writer2.write_string("world"); - let row2 = &CompactedRow::from_bytes(data_types, row_writer2.buffer()); + let row2 = &CompactedRow::from_bytes(&row_type, row_writer2.buffer()); let key2 = b"key2"; builder.append_row(key2, Some(row2)).unwrap(); @@ -539,14 +535,14 @@ mod tests { // Verify first record let record1 = records[0].as_ref().unwrap(); assert_eq!(record1.key().as_ref(), key1); - let row1 = CompactedRow::from_bytes(&types, record1.value().unwrap()); + let row1 = CompactedRow::from_bytes(&row_type, record1.value().unwrap()); assert_eq!(row1.get_int(0), 42); assert_eq!(row1.get_string(1), "hello"); // Verify second record let record2 = records[1].as_ref().unwrap(); assert_eq!(record2.key().as_ref(), key2); - let row2 = CompactedRow::from_bytes(&types, record2.value().unwrap()); + let row2 = 
CompactedRow::from_bytes(&row_type, record2.value().unwrap()); assert_eq!(row2.get_int(0), 100); assert_eq!(row2.get_string(1), "world"); @@ -561,8 +557,8 @@ mod tests { let mut row_writer = CompactedRowWriter::new(1); row_writer.write_int(42); - let data_types = &[DataTypes::int()]; - let row = &CompactedRow::from_bytes(data_types, row_writer.buffer()); + let row_type = RowType::with_data_types([DataTypes::int()].to_vec()); + let row = &CompactedRow::from_bytes(&row_type, row_writer.buffer()); // INDEXED format should reject append_row let mut indexed_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::INDEXED); diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs index 9ff3b5ffd5..144f8985cf 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use crate::metadata::DataType; +use crate::metadata::RowType; use crate::row::compacted::compacted_row_reader::{CompactedRowDeserializer, CompactedRowReader}; use crate::row::{BinaryRow, GenericRow, InternalRow}; use std::sync::{Arc, OnceLock}; @@ -38,10 +38,10 @@ pub fn calculate_bit_set_width_in_bytes(arity: usize) -> usize { #[allow(dead_code)] impl<'a> CompactedRow<'a> { - pub fn from_bytes(data_types: &'a [DataType], data: &'a [u8]) -> Self { + pub fn from_bytes(row_type: &'a RowType, data: &'a [u8]) -> Self { Self::deserialize( - Arc::new(CompactedRowDeserializer::new(data_types)), - data_types.len(), + Arc::new(CompactedRowDeserializer::new(row_type)), + row_type.fields().len(), data, ) } @@ -84,7 +84,10 @@ impl<'a> InternalRow for CompactedRow<'a> { } fn is_null_at(&self, pos: usize) -> bool { - self.deserializer.get_data_types()[pos].is_nullable() && self.reader.is_null_at(pos) + self.deserializer.get_row_type().fields().as_slice()[pos] + .data_type + .is_nullable() + && self.reader.is_null_at(pos) } fn get_boolean(&self, pos: usize) -> bool { @@ -138,7 +141,7 @@ mod tests { use crate::row::binary::BinaryWriter; use crate::metadata::{ - BigIntType, BooleanType, BytesType, DoubleType, FloatType, IntType, SmallIntType, + BigIntType, BooleanType, BytesType, DataType, DoubleType, FloatType, IntType, SmallIntType, StringType, TinyIntType, }; use crate::row::compacted::compacted_row_writer::CompactedRowWriter; @@ -146,7 +149,7 @@ mod tests { #[test] fn test_compacted_row() { // Test all primitive types - let types = vec![ + let row_type = RowType::with_data_types(vec![ DataType::Boolean(BooleanType::new()), DataType::TinyInt(TinyIntType::new()), DataType::SmallInt(SmallIntType::new()), @@ -156,9 +159,9 @@ mod tests { DataType::Double(DoubleType::new()), DataType::String(StringType::new()), DataType::Bytes(BytesType::new()), - ]; + ]); - let mut writer = CompactedRowWriter::new(types.len()); + let mut writer = CompactedRowWriter::new(row_type.fields().len()); 
writer.write_boolean(true); writer.write_byte(1); @@ -171,7 +174,7 @@ mod tests { writer.write_bytes(&[1, 2, 3, 4, 5]); let bytes = writer.to_bytes(); - let mut row = CompactedRow::from_bytes(types.as_slice(), bytes.as_ref()); + let mut row = CompactedRow::from_bytes(&row_type, bytes.as_ref()); assert_eq!(row.get_field_count(), 9); assert!(row.get_boolean(0)); @@ -185,20 +188,23 @@ mod tests { assert_eq!(row.get_bytes(8), &[1, 2, 3, 4, 5]); // Test with nulls - let types = vec![ - DataType::Int(IntType::new()), - DataType::String(StringType::new()), - DataType::Double(DoubleType::new()), - ]; + let row_type = RowType::with_data_types( + [ + DataType::Int(IntType::new()), + DataType::String(StringType::new()), + DataType::Double(DoubleType::new()), + ] + .to_vec(), + ); - let mut writer = CompactedRowWriter::new(types.len()); + let mut writer = CompactedRowWriter::new(row_type.fields().len()); writer.write_int(100); writer.set_null_at(1); writer.write_double(2.71); let bytes = writer.to_bytes(); - row = CompactedRow::from_bytes(types.as_slice(), bytes.as_ref()); + row = CompactedRow::from_bytes(&row_type, bytes.as_ref()); assert!(!row.is_null_at(0)); assert!(row.is_null_at(1)); @@ -211,26 +217,28 @@ mod tests { assert_eq!(row.get_int(0), 100); // Test from_bytes - let types = vec![ + let row_type = RowType::with_data_types(vec![ DataType::Int(IntType::new()), DataType::String(StringType::new()), - ]; + ]); - let mut writer = CompactedRowWriter::new(types.len()); + let mut writer = CompactedRowWriter::new(row_type.fields().len()); writer.write_int(-1); writer.write_string("test"); let bytes = writer.to_bytes(); - let mut row = CompactedRow::from_bytes(types.as_slice(), bytes.as_ref()); + let mut row = CompactedRow::from_bytes(&row_type, bytes.as_ref()); assert_eq!(row.get_int(0), -1); assert_eq!(row.get_string(1), "test"); // Test large row let num_fields = 100; - let types: Vec = (0..num_fields) - .map(|_| DataType::Int(IntType::new())) - .collect(); + let row_type 
= RowType::with_data_types( + (0..num_fields) + .map(|_| DataType::Int(IntType::new())) + .collect(), + ); let mut writer = CompactedRowWriter::new(num_fields); @@ -239,7 +247,7 @@ mod tests { } let bytes = writer.to_bytes(); - row = CompactedRow::from_bytes(types.as_slice(), bytes.as_ref()); + row = CompactedRow::from_bytes(&row_type, bytes.as_ref()); for i in 0..num_fields { assert_eq!(row.get_int(i), (i * 10) as i32); diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs index 9ce50952f4..408706cc83 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::metadata::RowType; use crate::row::compacted::compacted_row::calculate_bit_set_width_in_bytes; use crate::{ metadata::DataType, @@ -27,31 +28,32 @@ use std::str::from_utf8; #[allow(dead_code)] #[derive(Clone)] pub struct CompactedRowDeserializer<'a> { - schema: Cow<'a, [DataType]>, + row_type: Cow<'a, RowType>, } #[allow(dead_code)] impl<'a> CompactedRowDeserializer<'a> { - pub fn new(schema: &'a [DataType]) -> Self { + pub fn new(row_type: &'a RowType) -> Self { Self { - schema: Cow::Borrowed(schema), + row_type: Cow::Borrowed(row_type), } } - pub fn new_from_owned(schema: Vec) -> Self { + pub fn new_from_owned(row_type: RowType) -> Self { Self { - schema: Cow::Owned(schema), + row_type: Cow::Owned(row_type), } } - pub fn get_data_types(&self) -> &[DataType] { - self.schema.as_ref() + pub fn get_row_type(&self) -> &RowType { + self.row_type.as_ref() } pub fn deserialize(&self, reader: &CompactedRowReader<'a>) -> GenericRow<'a> { let mut row = GenericRow::new(); let mut cursor = reader.initial_position(); - for (col_pos, dtype) in self.schema.iter().enumerate() { + for (col_pos, data_field) in 
self.row_type.fields().iter().enumerate() { + let dtype = &data_field.data_type; if dtype.is_nullable() && reader.is_null_at(col_pos) { row.set_field(col_pos, Datum::Null); continue; diff --git a/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs b/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs index fc39bb7a26..48b9f3ff58 100644 --- a/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs +++ b/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs @@ -17,7 +17,7 @@ use crate::error::Error::IllegalArgument; use crate::error::Result; -use crate::metadata::DataType; +use crate::metadata::RowType; use crate::row::Datum; use crate::row::binary::{BinaryRowFormat, BinaryWriter, ValueWriter}; use crate::row::compacted::{CompactedRow, CompactedRowDeserializer, CompactedRowWriter}; @@ -33,18 +33,18 @@ pub struct CompactedRowEncoder<'a> { } impl<'a> CompactedRowEncoder<'a> { - pub fn new(field_data_types: Vec) -> Result { - let field_writers = field_data_types - .iter() + pub fn new(row_type: RowType) -> Result { + let field_writers = row_type + .field_types() .map(|d| ValueWriter::create_value_writer(d, Some(&BinaryRowFormat::Compacted))) .collect::>>()?; Ok(Self { - arity: field_data_types.len(), - writer: CompactedRowWriter::new(field_data_types.len()), + arity: field_writers.len(), + writer: CompactedRowWriter::new(field_writers.len()), field_writers, compacted_row_deserializer: Arc::new(CompactedRowDeserializer::new_from_owned( - field_data_types, + row_type, )), }) } @@ -60,10 +60,7 @@ impl RowEncoder for CompactedRowEncoder<'_> { self.field_writers .get(pos) .ok_or_else(|| IllegalArgument { - message: format!( - "invalid position {} when attempting to encode value {}", - pos, value - ), + message: format!("invalid position {pos} when attempting to encode value {value}"), })? 
.write_value(&mut self.writer, pos, &value) } diff --git a/fluss-rust/crates/fluss/src/row/encode/mod.rs b/fluss-rust/crates/fluss/src/row/encode/mod.rs index 34863aba19..c294ecf1d1 100644 --- a/fluss-rust/crates/fluss/src/row/encode/mod.rs +++ b/fluss-rust/crates/fluss/src/row/encode/mod.rs @@ -19,7 +19,7 @@ mod compacted_key_encoder; mod compacted_row_encoder; use crate::error::Result; -use crate::metadata::{DataLakeFormat, DataType, KvFormat, RowType}; +use crate::metadata::{DataLakeFormat, KvFormat, RowType}; use crate::row::encode::compacted_key_encoder::CompactedKeyEncoder; use crate::row::encode::compacted_row_encoder::CompactedRowEncoder; use crate::row::{BinaryRow, Datum, InternalRow}; @@ -111,18 +111,18 @@ pub struct RowEncoderFactory {} #[allow(dead_code)] impl RowEncoderFactory { pub fn create(kv_format: KvFormat, row_type: &RowType) -> Result { - Self::create_for_field_types(kv_format, row_type.field_types().cloned().collect()) + Self::create_for_field_types(kv_format, row_type.clone()) } pub fn create_for_field_types( kv_format: KvFormat, - field_data_types: Vec, + row_type: RowType, ) -> Result { match kv_format { KvFormat::INDEXED => { todo!() } - KvFormat::COMPACTED => CompactedRowEncoder::new(field_data_types), + KvFormat::COMPACTED => CompactedRowEncoder::new(row_type), } } } diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index 499606354c..3477f1de20 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -21,11 +21,13 @@ mod datum; pub mod binary; pub mod compacted; -mod encode; +pub mod encode; mod field_getter; pub use column::*; +pub use compacted::CompactedRow; pub use datum::*; +pub use encode::KeyEncoder; pub trait BinaryRow: InternalRow { /// Returns the binary representation of this row as a byte slice. 
diff --git a/fluss-rust/crates/fluss/src/rpc/api_key.rs b/fluss-rust/crates/fluss/src/rpc/api_key.rs index c51539642b..9f9268e857 100644 --- a/fluss-rust/crates/fluss/src/rpc/api_key.rs +++ b/fluss-rust/crates/fluss/src/rpc/api_key.rs @@ -31,6 +31,7 @@ pub enum ApiKey { MetaData, ProduceLog, FetchLog, + Lookup, ListOffsets, GetFileSystemSecurityToken, GetDatabaseInfo, @@ -53,6 +54,7 @@ impl From for ApiKey { 1012 => ApiKey::MetaData, 1014 => ApiKey::ProduceLog, 1015 => ApiKey::FetchLog, + 1017 => ApiKey::Lookup, 1021 => ApiKey::ListOffsets, 1025 => ApiKey::GetFileSystemSecurityToken, 1032 => ApiKey::GetLatestLakeSnapshot, @@ -77,6 +79,7 @@ impl From for i16 { ApiKey::MetaData => 1012, ApiKey::ProduceLog => 1014, ApiKey::FetchLog => 1015, + ApiKey::Lookup => 1017, ApiKey::ListOffsets => 1021, ApiKey::GetFileSystemSecurityToken => 1025, ApiKey::GetLatestLakeSnapshot => 1032, @@ -105,6 +108,7 @@ mod tests { (1012, ApiKey::MetaData), (1014, ApiKey::ProduceLog), (1015, ApiKey::FetchLog), + (1017, ApiKey::Lookup), (1021, ApiKey::ListOffsets), (1025, ApiKey::GetFileSystemSecurityToken), (1032, ApiKey::GetLatestLakeSnapshot), diff --git a/fluss-rust/crates/fluss/src/rpc/message/lookup.rs b/fluss-rust/crates/fluss/src/rpc/message/lookup.rs new file mode 100644 index 0000000000..3de47d64dd --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/lookup.rs @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::proto::LookupResponse; +use crate::rpc::frame::ReadError; + +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::WriteError; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; +use crate::{impl_read_version_type, impl_write_version_type, proto}; +use prost::Message; + +use bytes::{Buf, BufMut}; + +pub struct LookupRequest { + pub inner_request: proto::LookupRequest, +} + +impl LookupRequest { + pub fn new( + table_id: i64, + partition_id: Option, + bucket_id: i32, + keys: Vec>, + ) -> Self { + let bucket_req = proto::PbLookupReqForBucket { + partition_id, + bucket_id, + key: keys, + }; + + let request = proto::LookupRequest { + table_id, + buckets_req: vec![bucket_req], + }; + + Self { + inner_request: request, + } + } +} + +impl RequestBody for LookupRequest { + type ResponseBody = LookupResponse; + + const API_KEY: ApiKey = ApiKey::Lookup; + + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + +impl_write_version_type!(LookupRequest); +impl_read_version_type!(LookupResponse); diff --git a/fluss-rust/crates/fluss/src/rpc/message/mod.rs b/fluss-rust/crates/fluss/src/rpc/message/mod.rs index b619ee4023..2fe506bc37 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/mod.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/mod.rs @@ -34,6 +34,7 @@ mod header; mod list_databases; mod list_offsets; mod list_tables; +mod lookup; mod produce_log; mod table_exists; mod update_metadata; @@ -53,6 +54,7 @@ pub use header::*; pub use list_databases::*; pub use 
list_offsets::*; pub use list_tables::*; +pub use lookup::*; pub use produce_log::*; pub use table_exists::*; pub use update_metadata::*; diff --git a/fluss-rust/crates/fluss/src/util/varint.rs b/fluss-rust/crates/fluss/src/util/varint.rs index 96fd1f50bf..83a75f6c37 100644 --- a/fluss-rust/crates/fluss/src/util/varint.rs +++ b/fluss-rust/crates/fluss/src/util/varint.rs @@ -364,12 +364,11 @@ mod tests { let mut reader = Cursor::new(&buffer); let read_value = read_unsigned_varint(&mut reader).unwrap(); - assert_eq!(value, read_value, "Round trip failed for value {}", value); + assert_eq!(value, read_value, "Round trip failed for value {value}"); assert_eq!( written, buffer.len(), - "Bytes written mismatch for value {}", - value + "Bytes written mismatch for value {value}" ); // Test with BufMut @@ -382,22 +381,19 @@ mod tests { assert_eq!( calculated_size, buffer.len(), - "Size calculation failed for value {}", - value + "Size calculation failed for value {value}" ); // Test reading from bytes let (read_value_bytes, bytes_read) = read_unsigned_varint_bytes(&buffer).unwrap(); assert_eq!( value, read_value_bytes, - "Bytes read failed for value {}", - value + "Bytes read failed for value {value}" ); assert_eq!( bytes_read, buffer.len(), - "Bytes read count mismatch for value {}", - value + "Bytes read count mismatch for value {value}" ); } } From 9ce10ce55d2978ab068b6e4f18b1b3b5f3195184 Mon Sep 17 00:00:00 2001 From: Anton Borisov <148864665+fresh-borzoni@users.noreply.github.com> Date: Sat, 17 Jan 2026 01:53:30 +0000 Subject: [PATCH 070/287] feat: Introduce python bindings row-based append API (#142) --- fluss-rust/.gitignore | 11 +- fluss-rust/bindings/python/example/example.py | 14 +- fluss-rust/bindings/python/fluss/__init__.pyi | 26 ++ fluss-rust/bindings/python/src/table.rs | 250 ++++++++++++++++-- 4 files changed, 283 insertions(+), 18 deletions(-) diff --git a/fluss-rust/.gitignore b/fluss-rust/.gitignore index c6edfb706d..8202bbca02 100644 --- 
a/fluss-rust/.gitignore +++ b/fluss-rust/.gitignore @@ -17,4 +17,13 @@ Cargo.lock # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ -.vscode/ \ No newline at end of file +.vscode/ + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +*.egg-info/ +dist/ +build/ \ No newline at end of file diff --git a/fluss-rust/bindings/python/example/example.py b/fluss-rust/bindings/python/example/example.py index 0b1e67d3e6..f1f20d1503 100644 --- a/fluss-rust/bindings/python/example/example.py +++ b/fluss-rust/bindings/python/example/example.py @@ -118,11 +118,21 @@ async def main(): append_writer.write_arrow_batch(pa_record_batch) print("Successfully wrote PyArrow RecordBatch") - # Test 3: Write Pandas DataFrame + # Test 3: Append single rows + print("\n--- Testing single row append ---") + # Dict input + await append_writer.append({"id": 8, "name": "Helen", "score": 93.5, "age": 26}) + print("Successfully appended row (dict)") + + # List input + await append_writer.append([9, "Ivan", 90.0, 31]) + print("Successfully appended row (list)") + + # Test 4: Write Pandas DataFrame print("\n--- Testing Pandas DataFrame write ---") df = pd.DataFrame( { - "id": [6, 7], + "id": [10, 11], "name": ["Frank", "Grace"], "score": [89.3, 94.7], "age": [29, 27], diff --git a/fluss-rust/bindings/python/fluss/__init__.pyi b/fluss-rust/bindings/python/fluss/__init__.pyi index 45652425ba..6073070c0d 100644 --- a/fluss-rust/bindings/python/fluss/__init__.pyi +++ b/fluss-rust/bindings/python/fluss/__init__.pyi @@ -68,6 +68,32 @@ class FlussTable: def __repr__(self) -> str: ... class AppendWriter: + async def append(self, row: dict | list | tuple) -> None: + """Append a single row to the table. 
+ + Args: + row: Dictionary mapping field names to values, or + list/tuple of values in schema order + + Supported Types: + Currently supports primitive types only: + - Boolean, TinyInt, SmallInt, Int, BigInt (integers) + - Float, Double (floating point) + - String, Char (text) + - Bytes, Binary (binary data) + - Null values + + Temporal types (Date, Timestamp, Decimal) are not yet supported. + + Example: + await writer.append({'id': 1, 'name': 'Alice', 'score': 95.5}) + await writer.append([1, 'Alice', 95.5]) + + Note: + For high-throughput bulk loading, prefer write_arrow_batch(). + Use flush() to ensure all queued records are sent and acknowledged. + """ + ... def write_arrow(self, table: pa.Table) -> None: ... def write_arrow_batch(self, batch: pa.RecordBatch) -> None: ... def write_pandas(self, df: pd.DataFrame) -> None: ... diff --git a/fluss-rust/bindings/python/src/table.rs b/fluss-rust/bindings/python/src/table.rs index 6cd13c4fa1..db85c51f5a 100644 --- a/fluss-rust/bindings/python/src/table.rs +++ b/fluss-rust/bindings/python/src/table.rs @@ -49,7 +49,7 @@ impl FlussTable { let table_info = self.table_info.clone(); future_into_py(py, async move { - let fluss_table = fcore::client::FlussTable::new(&conn, metadata, table_info); + let fluss_table = fcore::client::FlussTable::new(&conn, metadata, table_info.clone()); let table_append = fluss_table .new_append() @@ -57,7 +57,7 @@ impl FlussTable { let rust_writer = table_append.create_writer(); - let py_writer = AppendWriter::from_core(rust_writer); + let py_writer = AppendWriter::from_core(rust_writer, table_info); Python::attach(|py| Py::new(py, py_writer)) }) @@ -193,13 +193,14 @@ impl FlussTable { /// Writer for appending data to a Fluss table #[pyclass] pub struct AppendWriter { - inner: fcore::client::AppendWriter, + inner: Arc, + table_info: fcore::metadata::TableInfo, } #[pymethods] impl AppendWriter { /// Write Arrow table data - pub fn write_arrow(&mut self, py: Python, table: Py) -> PyResult<()> { + 
pub fn write_arrow(&self, py: Python, table: Py) -> PyResult<()> { // Convert Arrow Table to batches and write each batch let batches = table.call_method0(py, "to_batches")?; let batch_list: Vec> = batches.extract(py)?; @@ -211,22 +212,40 @@ impl AppendWriter { } /// Write Arrow batch data - pub fn write_arrow_batch(&mut self, py: Python, batch: Py) -> PyResult<()> { + pub fn write_arrow_batch(&self, py: Python, batch: Py) -> PyResult<()> { // This shares the underlying Arrow buffers without copying data let batch_bound = batch.bind(py); let rust_batch: RecordBatch = FromPyArrow::from_pyarrow_bound(batch_bound) .map_err(|e| FlussError::new_err(format!("Failed to convert RecordBatch: {e}")))?; + let inner = self.inner.clone(); // Release the GIL before blocking on async operation let result = py.detach(|| { - TOKIO_RUNTIME.block_on(async { self.inner.append_arrow_batch(rust_batch).await }) + TOKIO_RUNTIME.block_on(async { inner.append_arrow_batch(rust_batch).await }) }); result.map_err(|e| FlussError::new_err(e.to_string())) } + /// Append a single row to the table + pub fn append<'py>( + &self, + py: Python<'py>, + row: &Bound<'py, PyAny>, + ) -> PyResult> { + let generic_row = python_to_generic_row(row, &self.table_info)?; + let inner = self.inner.clone(); + + future_into_py(py, async move { + inner + .append(generic_row) + .await + .map_err(|e| FlussError::new_err(e.to_string())) + }) + } + /// Write Pandas DataFrame data - pub fn write_pandas(&mut self, py: Python, df: Py) -> PyResult<()> { + pub fn write_pandas(&self, py: Python, df: Py) -> PyResult<()> { // Import pyarrow module let pyarrow = py.import("pyarrow")?; @@ -241,12 +260,16 @@ impl AppendWriter { } /// Flush any pending data - pub fn flush(&mut self) -> PyResult<()> { - TOKIO_RUNTIME.block_on(async { - self.inner - .flush() - .await - .map_err(|e| FlussError::new_err(e.to_string())) + pub fn flush(&self, py: Python) -> PyResult<()> { + let inner = self.inner.clone(); + // Release the GIL before 
blocking on I/O + py.detach(|| { + TOKIO_RUNTIME.block_on(async { + inner + .flush() + .await + .map_err(|e| FlussError::new_err(e.to_string())) + }) }) } @@ -257,8 +280,205 @@ impl AppendWriter { impl AppendWriter { /// Create a AppendWriter from a core append writer - pub fn from_core(append: fcore::client::AppendWriter) -> Self { - Self { inner: append } + pub fn from_core( + append: fcore::client::AppendWriter, + table_info: fcore::metadata::TableInfo, + ) -> Self { + Self { + inner: Arc::new(append), + table_info, + } + } +} + +/// Represents different input shapes for a row +#[derive(FromPyObject)] +enum RowInput<'py> { + Dict(Bound<'py, pyo3::types::PyDict>), + Tuple(Bound<'py, pyo3::types::PyTuple>), + List(Bound<'py, pyo3::types::PyList>), +} + +/// Helper function to process sequence types (list/tuple) into datums +fn process_sequence_to_datums<'a, I>( + values: I, + len: usize, + fields: &[fcore::metadata::DataField], +) -> PyResult>> +where + I: Iterator>, +{ + if len != fields.len() { + return Err(FlussError::new_err(format!( + "Expected {} values, got {}", + fields.len(), + len + ))); + } + + let mut datums = Vec::with_capacity(fields.len()); + for (i, (field, value)) in fields.iter().zip(values).enumerate() { + datums.push( + python_value_to_datum(&value, field.data_type()).map_err(|e| { + FlussError::new_err(format!("Field '{}' (index {}): {}", field.name(), i, e)) + })?, + ); + } + Ok(datums) +} + +/// Convert Python row (dict/list/tuple) to GenericRow based on schema +fn python_to_generic_row( + row: &Bound, + table_info: &fcore::metadata::TableInfo, +) -> PyResult> { + // Extract with user-friendly error message + let row_input: RowInput = row.extract().map_err(|_| { + let type_name = row + .get_type() + .name() + .map(|n| n.to_string()) + .unwrap_or_else(|_| "unknown".to_string()); + FlussError::new_err(format!( + "Row must be a dict, list, or tuple; got {}", + type_name + )) + })?; + let schema = table_info.row_type(); + let fields = 
schema.fields(); + + let datums = match row_input { + RowInput::Dict(dict) => { + // Strict: reject unknown keys (and also reject non-str keys nicely) + for (k, _) in dict.iter() { + let key_str = k.extract::<&str>().map_err(|_| { + let key_type = k + .get_type() + .name() + .map(|n| n.to_string()) + .unwrap_or_else(|_| "unknown".to_string()); + FlussError::new_err(format!("Row dict keys must be strings; got {}", key_type)) + })?; + + if fields.iter().all(|f| f.name() != key_str) { + let expected = fields + .iter() + .map(|f| f.name()) + .collect::>() + .join(", "); + return Err(FlussError::new_err(format!( + "Unknown field '{}'. Expected fields: {}", + key_str, expected + ))); + } + } + + let mut datums = Vec::with_capacity(fields.len()); + for field in fields { + let value = dict.get_item(field.name())?.ok_or_else(|| { + FlussError::new_err(format!("Missing field: {}", field.name())) + })?; + datums.push( + python_value_to_datum(&value, field.data_type()).map_err(|e| { + FlussError::new_err(format!("Field '{}': {}", field.name(), e)) + })?, + ); + } + datums + } + + RowInput::List(list) => process_sequence_to_datums(list.iter(), list.len(), fields)?, + + RowInput::Tuple(tuple) => process_sequence_to_datums(tuple.iter(), tuple.len(), fields)?, + }; + + Ok(fcore::row::GenericRow { values: datums }) +} + +/// Convert Python value to Datum based on data type +fn python_value_to_datum( + value: &Bound, + data_type: &fcore::metadata::DataType, +) -> PyResult> { + use fcore::row::{Datum, F32, F64}; + + if value.is_none() { + return Ok(Datum::Null); + } + + match data_type { + fcore::metadata::DataType::Boolean(_) => { + let v: bool = value.extract()?; + Ok(Datum::Bool(v)) + } + fcore::metadata::DataType::TinyInt(_) => { + // Strict type checking: reject bool for int columns + if value.is_instance_of::() { + return Err(FlussError::new_err( + "Expected int for TinyInt column, got bool. 
Use 0 or 1 explicitly.".to_string(), + )); + } + let v: i8 = value.extract()?; + Ok(Datum::Int8(v)) + } + fcore::metadata::DataType::SmallInt(_) => { + if value.is_instance_of::() { + return Err(FlussError::new_err( + "Expected int for SmallInt column, got bool. Use 0 or 1 explicitly." + .to_string(), + )); + } + let v: i16 = value.extract()?; + Ok(Datum::Int16(v)) + } + fcore::metadata::DataType::Int(_) => { + if value.is_instance_of::() { + return Err(FlussError::new_err( + "Expected int for Int column, got bool. Use 0 or 1 explicitly.".to_string(), + )); + } + let v: i32 = value.extract()?; + Ok(Datum::Int32(v)) + } + fcore::metadata::DataType::BigInt(_) => { + if value.is_instance_of::() { + return Err(FlussError::new_err( + "Expected int for BigInt column, got bool. Use 0 or 1 explicitly.".to_string(), + )); + } + let v: i64 = value.extract()?; + Ok(Datum::Int64(v)) + } + fcore::metadata::DataType::Float(_) => { + let v: f32 = value.extract()?; + Ok(Datum::Float32(F32::from(v))) + } + fcore::metadata::DataType::Double(_) => { + let v: f64 = value.extract()?; + Ok(Datum::Float64(F64::from(v))) + } + fcore::metadata::DataType::String(_) | fcore::metadata::DataType::Char(_) => { + let v: String = value.extract()?; + Ok(v.into()) + } + fcore::metadata::DataType::Bytes(_) | fcore::metadata::DataType::Binary(_) => { + // Efficient extraction: downcast to specific type and use bulk copy. + // PyBytes::as_bytes() and PyByteArray::to_vec() are O(n) bulk copies of the underlying data. + if let Ok(bytes) = value.downcast::() { + Ok(bytes.as_bytes().to_vec().into()) + } else if let Ok(bytearray) = value.downcast::() { + Ok(bytearray.to_vec().into()) + } else { + Err(FlussError::new_err(format!( + "Expected bytes or bytearray, got {}", + value.get_type().name()? 
+ ))) + } + } + _ => Err(FlussError::new_err(format!( + "Unsupported data type for row-level operations: {:?}", + data_type + ))), } } From a4409833d235dfec67b3004136bf0ffa155b543c Mon Sep 17 00:00:00 2001 From: AlexZhao Date: Sat, 17 Jan 2026 14:33:57 +0800 Subject: [PATCH 071/287] chore: Improve read path error handling logic (#143) --- .../src/client/table/log_fetch_buffer.rs | 378 ++++++++++- .../fluss/src/client/table/remote_log.rs | 2 +- .../crates/fluss/src/client/table/scanner.rs | 628 ++++++++++++++---- .../crates/fluss/src/client/write/sender.rs | 2 +- fluss-rust/crates/fluss/src/error.rs | 5 + fluss-rust/crates/fluss/src/record/arrow.rs | 67 +- fluss-rust/crates/fluss/src/record/mod.rs | 62 ++ fluss-rust/crates/fluss/src/row/column.rs | 64 ++ .../fluss/src/rpc/message/list_offsets.rs | 52 +- fluss-rust/crates/fluss/src/util/mod.rs | 54 ++ 10 files changed, 1122 insertions(+), 192 deletions(-) diff --git a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs index c55c994b03..fb6981f4b5 100644 --- a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs +++ b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs @@ -18,7 +18,7 @@ use arrow::array::RecordBatch; use parking_lot::Mutex; -use crate::error::Result; +use crate::error::{ApiError, Error, Result}; use crate::metadata::TableBucket; use crate::record::{ LogRecordBatch, LogRecordIterator, LogRecordsBatches, ReadContext, ScanRecord, @@ -29,12 +29,38 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; use tokio::sync::Notify; +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum FetchErrorAction { + Ignore, + LogOffsetOutOfRange, + Authorization, + CorruptMessage, + Unexpected, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum FetchErrorLogLevel { + Debug, + Warn, +} + +#[derive(Clone, Debug)] +pub(crate) struct FetchErrorContext { + pub(crate) action: FetchErrorAction, 
+ pub(crate) log_level: FetchErrorLogLevel, + pub(crate) log_message: String, +} + /// Represents a completed fetch that can be consumed pub trait CompletedFetch: Send + Sync { fn table_bucket(&self) -> &TableBucket; + fn api_error(&self) -> Option<&ApiError>; + fn fetch_error_context(&self) -> Option<&FetchErrorContext>; + fn take_error(&mut self) -> Option; fn fetch_records(&mut self, max_records: usize) -> Result>; fn fetch_batches(&mut self, max_batches: usize) -> Result>; fn is_consumed(&self) -> bool; + fn records_read(&self) -> usize; fn drain(&mut self); fn size_in_bytes(&self) -> usize; fn high_watermark(&self) -> i64; @@ -52,6 +78,7 @@ pub trait PendingFetch: Send + Sync { /// Thread-safe buffer for completed fetches pub struct LogFetchBuffer { + read_context: ReadContext, completed_fetches: Mutex>>, pending_fetches: Mutex>>>, next_in_line_fetch: Mutex>>, @@ -60,8 +87,9 @@ pub struct LogFetchBuffer { } impl LogFetchBuffer { - pub fn new() -> Self { + pub fn new(read_context: ReadContext) -> Self { Self { + read_context, completed_fetches: Mutex::new(VecDeque::new()), pending_fetches: Mutex::new(HashMap::new()), next_in_line_fetch: Mutex::new(None), @@ -75,26 +103,28 @@ impl LogFetchBuffer { self.completed_fetches.lock().is_empty() } - /// Wait for the buffer to become non-empty, with timeout - /// Returns true if data became available, false if timeout - pub async fn await_not_empty(&self, timeout: Duration) -> bool { + /// Wait for the buffer to become non-empty, with timeout. + /// Returns true if data became available, false if timeout. 
+ pub async fn await_not_empty(&self, timeout: Duration) -> Result { let deadline = std::time::Instant::now() + timeout; loop { // Check if buffer is not empty if !self.is_empty() { - return true; + return Ok(true); } // Check if woken up if self.woken_up.swap(false, Ordering::Acquire) { - return true; + return Err(Error::WakeupError { + message: "The await operation was interrupted by wakeup.".to_string(), + }); } // Check if timeout let now = std::time::Instant::now(); if now >= deadline { - return false; + return Ok(false); } // Wait for notification with remaining time @@ -102,7 +132,7 @@ impl LogFetchBuffer { let notified = self.not_empty_notify.notified(); tokio::select! { _ = tokio::time::sleep(remaining) => { - return false; // Timeout + return Ok(false); // Timeout } _ = notified => { // Got notification, check again @@ -119,6 +149,26 @@ impl LogFetchBuffer { self.not_empty_notify.notify_waiters(); } + pub(crate) fn add_api_error( + &self, + table_bucket: TableBucket, + api_error: ApiError, + fetch_error_context: FetchErrorContext, + fetch_offset: i64, + ) { + let error_fetch = DefaultCompletedFetch::from_api_error( + table_bucket, + api_error, + fetch_error_context, + fetch_offset, + self.read_context.clone(), + ); + self.completed_fetches + .lock() + .push_back(Box::new(error_fetch)); + self.not_empty_notify.notify_waiters(); + } + /// Add a pending fetch to the buffer pub fn pend(&self, pending_fetch: Box) { let table_bucket = pending_fetch.table_bucket().clone(); @@ -136,6 +186,7 @@ impl LogFetchBuffer { // holding both locks simultaneously. let mut completed_to_push: Vec> = Vec::new(); let mut has_completed = false; + let mut pending_error: Option = None; { let mut pending_map = self.pending_fetches.lock(); if let Some(pendings) = pending_map.get_mut(table_bucket) { @@ -148,8 +199,9 @@ impl LogFetchBuffer { has_completed = true; } Err(e) => { - // todo: handle exception? 
- log::error!("Error when completing: {e}"); + pending_error = Some(e); + has_completed = true; + break; } } } else { @@ -162,11 +214,22 @@ impl LogFetchBuffer { } } + if let Some(error) = pending_error { + let error_fetch = DefaultCompletedFetch::from_error( + table_bucket.clone(), + error, + -1, + self.read_context.clone(), + ); + completed_to_push.push(Box::new(error_fetch)); + } + if !completed_to_push.is_empty() { let mut completed_queue = self.completed_fetches.lock(); for completed in completed_to_push { completed_queue.push_back(completed); } + has_completed = true; } if has_completed { @@ -236,12 +299,6 @@ impl LogFetchBuffer { } } -impl Default for LogFetchBuffer { - fn default() -> Self { - Self::new() - } -} - /// A wrapper that makes a completed fetch look like a pending fetch struct CompletedPendingFetch { completed_fetch: Box, @@ -270,6 +327,9 @@ impl PendingFetch for CompletedPendingFetch { /// Default implementation of CompletedFetch for in-memory log records pub struct DefaultCompletedFetch { table_bucket: TableBucket, + api_error: Option, + fetch_error_context: Option, + error: Option, log_record_batch: LogRecordsBatches, read_context: ReadContext, next_fetch_offset: i64, @@ -280,6 +340,9 @@ pub struct DefaultCompletedFetch { records_read: usize, current_record_iterator: Option, current_record_batch: Option, + last_record: Option, + cached_record_error: Option, + corrupt_last_record: bool, } impl DefaultCompletedFetch { @@ -290,9 +353,12 @@ impl DefaultCompletedFetch { read_context: ReadContext, fetch_offset: i64, high_watermark: i64, - ) -> Result { - Ok(Self { + ) -> Self { + Self { table_bucket, + api_error: None, + fetch_error_context: None, + error: None, log_record_batch, read_context, next_fetch_offset: fetch_offset, @@ -303,7 +369,65 @@ impl DefaultCompletedFetch { records_read: 0, current_record_iterator: None, current_record_batch: None, - }) + last_record: None, + cached_record_error: None, + corrupt_last_record: false, + } + } + + 
pub(crate) fn from_error( + table_bucket: TableBucket, + error: Error, + fetch_offset: i64, + read_context: ReadContext, + ) -> Self { + Self { + table_bucket, + api_error: None, + fetch_error_context: None, + error: Some(error), + log_record_batch: LogRecordsBatches::new(Vec::new()), + read_context, + next_fetch_offset: fetch_offset, + high_watermark: -1, + size_in_bytes: 0, + consumed: false, + initialized: false, + records_read: 0, + current_record_iterator: None, + current_record_batch: None, + last_record: None, + cached_record_error: None, + corrupt_last_record: false, + } + } + + pub(crate) fn from_api_error( + table_bucket: TableBucket, + api_error: ApiError, + fetch_error_context: FetchErrorContext, + fetch_offset: i64, + read_context: ReadContext, + ) -> Self { + Self { + table_bucket, + api_error: Some(api_error), + fetch_error_context: Some(fetch_error_context), + error: None, + log_record_batch: LogRecordsBatches::new(Vec::new()), + read_context, + next_fetch_offset: fetch_offset, + high_watermark: -1, + size_in_bytes: 0, + consumed: false, + initialized: false, + records_read: 0, + current_record_iterator: None, + current_record_batch: None, + last_record: None, + cached_record_error: None, + corrupt_last_record: false, + } } /// Get the next fetched record, handling batch iteration and record skipping @@ -330,6 +454,19 @@ impl DefaultCompletedFetch { } } + fn fetch_error(&self) -> Error { + let mut message = format!( + "Received exception when fetching the next record from {table_bucket}. 
If needed, please back to past the record to continue scanning.", + table_bucket = self.table_bucket + ); + if let Some(cause) = self.cached_record_error.as_deref() { + message.push_str(&format!(" Cause: {cause}")); + } + Error::UnexpectedError { + message, + source: None, + } + } /// Get the next batch directly without row iteration fn next_fetched_batch(&mut self) -> Result> { loop { @@ -368,8 +505,36 @@ impl CompletedFetch for DefaultCompletedFetch { &self.table_bucket } + fn api_error(&self) -> Option<&ApiError> { + self.api_error.as_ref() + } + + fn fetch_error_context(&self) -> Option<&FetchErrorContext> { + self.fetch_error_context.as_ref() + } + + fn take_error(&mut self) -> Option { + self.error.take() + } + fn fetch_records(&mut self, max_records: usize) -> Result> { - // todo: handle corrupt_last_record + if let Some(error) = self.error.take() { + return Err(error); + } + + if let Some(api_error) = self.api_error.as_ref() { + return Err(Error::FlussAPIError { + api_error: ApiError { + code: api_error.code, + message: api_error.message.clone(), + }, + }); + } + + if self.corrupt_last_record { + return Err(self.fetch_error()); + } + if self.consumed { return Ok(Vec::new()); } @@ -377,19 +542,53 @@ impl CompletedFetch for DefaultCompletedFetch { let mut scan_records = Vec::new(); for _ in 0..max_records { - if let Some(record) = self.next_fetched_record()? 
{ - self.next_fetch_offset = record.offset() + 1; - self.records_read += 1; - scan_records.push(record); - } else { - break; + if self.cached_record_error.is_none() { + self.corrupt_last_record = true; + match self.next_fetched_record() { + Ok(Some(record)) => { + self.corrupt_last_record = false; + self.last_record = Some(record); + } + Ok(None) => { + self.corrupt_last_record = false; + self.last_record = None; + } + Err(e) => { + self.cached_record_error = Some(e.to_string()); + } + } } + + let Some(record) = self.last_record.take() else { + break; + }; + + self.next_fetch_offset = record.offset() + 1; + self.records_read += 1; + scan_records.push(record); + } + + if self.cached_record_error.is_some() && scan_records.is_empty() { + return Err(self.fetch_error()); } Ok(scan_records) } fn fetch_batches(&mut self, max_batches: usize) -> Result> { + if let Some(error) = self.error.take() { + return Err(error); + } + + if let Some(api_error) = self.api_error.as_ref() { + return Err(Error::FlussAPIError { + api_error: ApiError { + code: api_error.code, + message: api_error.message.clone(), + }, + }); + } + if self.consumed { return Ok(Vec::new()); } @@ -410,8 +609,18 @@ impl CompletedFetch for DefaultCompletedFetch { self.consumed } + fn records_read(&self) -> usize { + self.records_read + } + fn drain(&mut self) { self.consumed = true; + self.api_error = None; + self.fetch_error_context = None; + self.error = None; + self.cached_record_error = None; + self.corrupt_last_record = false; + self.last_record = None; } fn size_in_bytes(&self) -> usize { @@ -434,3 +643,118 @@ impl CompletedFetch for DefaultCompletedFetch { self.next_fetch_offset } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::client::WriteRecord; + use crate::compression::{ + ArrowCompressionInfo, ArrowCompressionType, DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }; + use crate::metadata::{DataField, DataTypes, TablePath}; + use crate::record::{MemoryLogRecordsArrowBuilder, ReadContext, 
to_arrow_schema}; + use crate::row::GenericRow; + use std::sync::Arc; + use std::time::Duration; + + fn test_read_context() -> ReadContext { + let row_type = DataTypes::row(vec![DataField::new( + "id".to_string(), + DataTypes::int(), + None, + )]); + ReadContext::new(to_arrow_schema(&row_type), false) + } + + struct ErrorPendingFetch { + table_bucket: TableBucket, + } + + impl PendingFetch for ErrorPendingFetch { + fn table_bucket(&self) -> &TableBucket { + &self.table_bucket + } + + fn is_completed(&self) -> bool { + true + } + + fn to_completed_fetch(self: Box) -> Result> { + Err(Error::UnexpectedError { + message: "pending fetch failure".to_string(), + source: None, + }) + } + } + + #[tokio::test] + async fn await_not_empty_returns_wakeup_error() { + let buffer = LogFetchBuffer::new(test_read_context()); + buffer.wakeup(); + + let result = buffer.await_not_empty(Duration::from_millis(10)).await; + assert!(matches!(result, Err(Error::WakeupError { .. }))); + } + + #[tokio::test] + async fn await_not_empty_returns_pending_error() { + let buffer = LogFetchBuffer::new(test_read_context()); + let table_bucket = TableBucket::new(1, 0); + buffer.pend(Box::new(ErrorPendingFetch { + table_bucket: table_bucket.clone(), + })); + buffer.try_complete(&table_bucket); + + let result = buffer.await_not_empty(Duration::from_millis(10)).await; + assert!(matches!(result, Ok(true))); + + let mut completed = buffer.poll().expect("completed fetch"); + assert!(completed.take_error().is_some()); + } + + #[test] + fn default_completed_fetch_reads_records() -> Result<()> { + let row_type = DataTypes::row(vec![ + DataField::new("id".to_string(), DataTypes::int(), None), + DataField::new("name".to_string(), DataTypes::string(), None), + ]); + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + + let mut builder = MemoryLogRecordsArrowBuilder::new( + 1, + &row_type, + false, + ArrowCompressionInfo { + compression_type: ArrowCompressionType::None, + 
compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }, + ); + + let mut row = GenericRow::new(); + row.set_field(0, 1_i32); + row.set_field(1, "alice"); + let record = WriteRecord::new(table_path, row); + builder.append(&record)?; + + let data = builder.build()?; + let log_records = LogRecordsBatches::new(data.clone()); + let read_context = ReadContext::new(to_arrow_schema(&row_type), false); + let mut fetch = DefaultCompletedFetch::new( + TableBucket::new(1, 0), + log_records, + data.len(), + read_context, + 0, + 0, + ); + + let records = fetch.fetch_records(10)?; + assert_eq!(records.len(), 1); + assert_eq!(records[0].offset(), 0); + + let empty = fetch.fetch_records(10)?; + assert!(empty.is_empty()); + + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/remote_log.rs b/fluss-rust/crates/fluss/src/client/table/remote_log.rs index d9abd19c29..0142515783 100644 --- a/fluss-rust/crates/fluss/src/client/table/remote_log.rs +++ b/fluss-rust/crates/fluss/src/client/table/remote_log.rs @@ -409,7 +409,7 @@ impl PendingFetch for RemotePendingFetch { self.read_context, self.fetch_offset, self.high_watermark, - )?; + ); Ok(Box::new(completed_fetch)) } diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index 7d22324d6b..3e7d61ff99 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -17,7 +17,7 @@ use arrow::array::RecordBatch; use arrow_schema::SchemaRef; -use log::{debug, error, warn}; +use log::{debug, warn}; use parking_lot::{Mutex, RwLock}; use std::collections::{HashMap, HashSet}; use std::slice::from_ref; @@ -29,16 +29,17 @@ use crate::client::connection::FlussConnection; use crate::client::credentials::CredentialsCache; use crate::client::metadata::Metadata; use crate::client::table::log_fetch_buffer::{ - CompletedFetch, DefaultCompletedFetch, LogFetchBuffer, + CompletedFetch, DefaultCompletedFetch, 
FetchErrorAction, FetchErrorContext, FetchErrorLogLevel, + LogFetchBuffer, }; use crate::client::table::remote_log::{ RemoteLogDownloader, RemoteLogFetchInfo, RemotePendingFetch, }; -use crate::error::{Error, Result, RpcError}; +use crate::error::{ApiError, Error, FlussError, Result}; use crate::metadata::{TableBucket, TableInfo, TablePath}; -use crate::proto::{FetchLogRequest, PbFetchLogReqForBucket, PbFetchLogReqForTable}; +use crate::proto::{ErrorResponse, FetchLogRequest, PbFetchLogReqForBucket, PbFetchLogReqForTable}; use crate::record::{LogRecordsBatches, ReadContext, ScanRecord, ScanRecords, to_arrow_schema}; -use crate::rpc::{RpcClient, message}; +use crate::rpc::{RpcClient, RpcError, message}; use crate::util::FairBucketStatusMap; const LOG_FETCH_MAX_BYTES: i32 = 16 * 1024 * 1024; @@ -318,7 +319,7 @@ impl LogScannerInner { .log_fetcher .log_fetch_buffer .await_not_empty(remaining) - .await; + .await?; if !has_data { // Timeout while waiting @@ -396,7 +397,7 @@ impl LogScannerInner { .log_fetcher .log_fetch_buffer .await_not_empty(remaining) - .await; + .await?; if !has_data { return Ok(Vec::new()); @@ -448,6 +449,8 @@ impl RecordBatchLogScanner { struct LogFetcher { conns: Arc, metadata: Arc, + table_path: TablePath, + is_partitioned: bool, log_scanner_status: Arc, read_context: ReadContext, remote_read_context: ReadContext, @@ -457,8 +460,6 @@ struct LogFetcher { credentials_cache: Arc, log_fetch_buffer: Arc, nodes_with_pending_fetch_requests: Arc>>, - table_path: TablePath, - is_partitioned: bool, } impl LogFetcher { @@ -471,24 +472,25 @@ impl LogFetcher { ) -> Result { let full_arrow_schema = to_arrow_schema(table_info.get_row_type()); let read_context = - Self::create_read_context(full_arrow_schema.clone(), projected_fields.clone(), false); + Self::create_read_context(full_arrow_schema.clone(), projected_fields.clone(), false)?; let remote_read_context = - Self::create_read_context(full_arrow_schema, projected_fields.clone(), true); + 
Self::create_read_context(full_arrow_schema, projected_fields.clone(), true)?; let tmp_dir = TempDir::with_prefix("fluss-remote-logs")?; + let log_fetch_buffer = Arc::new(LogFetchBuffer::new(read_context.clone())); Ok(LogFetcher { conns: conns.clone(), metadata: metadata.clone(), + table_path: table_info.table_path.clone(), + is_partitioned: table_info.is_partitioned(), log_scanner_status, read_context, remote_read_context, remote_log_downloader: Arc::new(RemoteLogDownloader::new(tmp_dir)?), credentials_cache: Arc::new(CredentialsCache::new(conns.clone(), metadata.clone())), - log_fetch_buffer: Arc::new(LogFetchBuffer::new()), + log_fetch_buffer, nodes_with_pending_fetch_requests: Arc::new(Mutex::new(HashSet::new())), - table_path: table_info.table_path.clone(), - is_partitioned: table_info.is_partitioned(), }) } @@ -496,23 +498,79 @@ impl LogFetcher { full_arrow_schema: SchemaRef, projected_fields: Option>, is_from_remote: bool, - ) -> ReadContext { + ) -> Result { match projected_fields { - None => ReadContext::new(full_arrow_schema, is_from_remote), + None => Ok(ReadContext::new(full_arrow_schema, is_from_remote)), Some(fields) => { ReadContext::with_projection_pushdown(full_arrow_schema, fields, is_from_remote) } } } - async fn check_and_update_metadata(&self) -> Result<()> { - if self.is_partitioned { - // TODO: Implement partition-aware metadata refresh for buckets whose leaders are unknown. - // The implementation will likely need to collect partition IDs for such buckets and - // perform targeted metadata updates. Until then, we avoid computing unused partition_ids. 
- return Ok(()); + fn describe_fetch_error( + error: FlussError, + table_bucket: &TableBucket, + fetch_offset: i64, + error_message: &str, + ) -> FetchErrorContext { + match error { + FlussError::NotLeaderOrFollower + | FlussError::LogStorageException + | FlussError::KvStorageException + | FlussError::StorageException + | FlussError::FencedLeaderEpochException => FetchErrorContext { + action: FetchErrorAction::Ignore, + log_level: FetchErrorLogLevel::Debug, + log_message: format!( + "Error in fetch for bucket {table_bucket}: {error:?}: {error_message}" + ), + }, + FlussError::UnknownTableOrBucketException => FetchErrorContext { + action: FetchErrorAction::Ignore, + log_level: FetchErrorLogLevel::Warn, + log_message: format!( + "Received unknown table or bucket error in fetch for bucket {table_bucket}" + ), + }, + FlussError::LogOffsetOutOfRangeException => FetchErrorContext { + action: FetchErrorAction::LogOffsetOutOfRange, + log_level: FetchErrorLogLevel::Debug, + log_message: format!( + "The fetching offset {fetch_offset} is out of range for bucket {table_bucket}: {error_message}" + ), + }, + FlussError::AuthorizationException => FetchErrorContext { + action: FetchErrorAction::Authorization, + log_level: FetchErrorLogLevel::Debug, + log_message: format!( + "Authorization error while fetching offset {fetch_offset} for bucket {table_bucket}: {error_message}" + ), + }, + FlussError::UnknownServerError => FetchErrorContext { + action: FetchErrorAction::Ignore, + log_level: FetchErrorLogLevel::Warn, + log_message: format!( + "Unknown server error while fetching offset {fetch_offset} for bucket {table_bucket}: {error_message}" + ), + }, + FlussError::CorruptMessage => FetchErrorContext { + action: FetchErrorAction::CorruptMessage, + log_level: FetchErrorLogLevel::Debug, + log_message: format!( + "Encountered corrupt message when fetching offset {fetch_offset} for bucket {table_bucket}: {error_message}" + ), + }, + _ => FetchErrorContext { + action: 
FetchErrorAction::Unexpected, + log_level: FetchErrorLogLevel::Debug, + log_message: format!( + "Unexpected error code {error:?} while fetching at offset {fetch_offset} from bucket {table_bucket}: {error_message}" + ), + }, } + } + async fn check_and_update_metadata(&self) -> Result<()> { let need_update = self .fetchable_buckets() .iter() @@ -522,6 +580,26 @@ impl LogFetcher { return Ok(()); } + if self.is_partitioned { + // Fallback to full table metadata refresh until partition-aware updates are available. + self.metadata + .update_tables_metadata(&HashSet::from([&self.table_path])) + .await + .or_else(|e| { + if let Error::RpcError { source, .. } = &e + && matches!(source, RpcError::ConnectionError(_) | RpcError::Poisoned(_)) + { + warn!( + "Retrying after encountering error while updating table metadata: {e}" + ); + Ok(()) + } else { + Err(e) + } + })?; + return Ok(()); + } + // TODO: Handle PartitionNotExist error self.metadata .update_tables_metadata(&HashSet::from([&self.table_path])) @@ -561,7 +639,6 @@ impl LogFetcher { let creds_cache = self.credentials_cache.clone(); let nodes_with_pending = self.nodes_with_pending_fetch_requests.clone(); let metadata = self.metadata.clone(); - // Spawn async task to handle the fetch request // Note: These tasks are not explicitly tracked or cancelled when LogFetcher is dropped. 
// This is acceptable because: @@ -607,7 +684,7 @@ impl LogFetcher { } }; - if let Err(e) = Self::handle_fetch_response( + Self::handle_fetch_response( fetch_response, &log_fetch_buffer, &log_scanner_status, @@ -616,10 +693,7 @@ impl LogFetcher { &remote_log_downloader, &creds_cache, ) - .await - { - error!("Fail to handle fetch response: {e:?}"); - } + .await; }); } @@ -644,7 +718,7 @@ impl LogFetcher { remote_read_context: &ReadContext, remote_log_downloader: &Arc, credentials_cache: &Arc, - ) -> Result<()> { + ) { for pb_fetch_log_resp in fetch_response.tables_resp { let table_id = pb_fetch_log_resp.table_id; let fetch_log_for_buckets = pb_fetch_log_resp.buckets_resp; @@ -661,11 +735,45 @@ impl LogFetcher { continue; }; + if let Some(error_code) = fetch_log_for_bucket.error_code + && error_code != FlussError::None.code() + { + let api_error: ApiError = ErrorResponse { + error_code, + error_message: fetch_log_for_bucket.error_message.clone(), + } + .into(); + + let error = FlussError::for_code(error_code); + let error_context = Self::describe_fetch_error( + error, + &table_bucket, + fetch_offset, + api_error.message.as_str(), + ); + log_scanner_status.move_bucket_to_end(table_bucket.clone()); + match error_context.log_level { + FetchErrorLogLevel::Debug => { + debug!("{}", error_context.log_message); + } + FetchErrorLogLevel::Warn => { + warn!("{}", error_context.log_message); + } + } + log_fetch_buffer.add_api_error( + table_bucket.clone(), + api_error, + error_context, + fetch_offset, + ); + continue; + } + // Check if this is a remote log fetch if let Some(ref remote_log_fetch_info) = fetch_log_for_bucket.remote_log_fetch_info { // set remote fs props - let remote_fs_props = credentials_cache.get_or_refresh().await?; + let remote_fs_props = credentials_cache.get_or_refresh().await.unwrap(); remote_log_downloader.set_remote_fs_props(remote_fs_props); let remote_fetch_info = @@ -688,26 +796,18 @@ impl LogFetcher { let size_in_bytes = records.len(); let 
log_record_batch = LogRecordsBatches::new(records); - match DefaultCompletedFetch::new( + let completed_fetch = DefaultCompletedFetch::new( table_bucket.clone(), log_record_batch, size_in_bytes, read_context.clone(), fetch_offset, high_watermark, - ) { - Ok(completed_fetch) => { - log_fetch_buffer.add(Box::new(completed_fetch)); - } - Err(e) => { - // todo: handle error - log::warn!("Failed to create completed fetch: {e:?}"); - } - } + ); + log_fetch_buffer.add(Box::new(completed_fetch)); } } } - Ok(()) } fn pending_remote_fetches( @@ -763,69 +863,91 @@ impl LogFetcher { let mut result: HashMap> = HashMap::new(); let mut records_remaining = MAX_POLL_RECORDS; - while records_remaining > 0 { - // Get the next in line fetch, or get a new one from buffer - let next_in_line = self.log_fetch_buffer.next_in_line_fetch(); - - if next_in_line.is_none() || next_in_line.as_ref().unwrap().is_consumed() { - // Get a new fetch from buffer - if let Some(completed_fetch) = self.log_fetch_buffer.poll() { - // Initialize the fetch if not already initialized - if !completed_fetch.is_initialized() { - let size_in_bytes = completed_fetch.size_in_bytes(); - match self.initialize_fetch(completed_fetch) { - Ok(initialized) => { - self.log_fetch_buffer.set_next_in_line_fetch(initialized); - continue; + let collect_result: Result<()> = { + while records_remaining > 0 { + // Get the next in line fetch, or get a new one from buffer + let next_in_line = self.log_fetch_buffer.next_in_line_fetch(); + + if next_in_line.is_none() || next_in_line.as_ref().unwrap().is_consumed() { + // Get a new fetch from buffer + if let Some(completed_fetch) = self.log_fetch_buffer.poll() { + // Initialize the fetch if not already initialized + if !completed_fetch.is_initialized() { + let size_in_bytes = completed_fetch.size_in_bytes(); + match self.initialize_fetch(completed_fetch) { + Ok(initialized) => { + self.log_fetch_buffer.set_next_in_line_fetch(initialized); + continue; + } + Err(e) => { + // Remove a 
completedFetch upon a parse with exception if + // (1) it contains no records, and + // (2) there are no fetched records with actual content preceding this + // exception. + if result.is_empty() && size_in_bytes == 0 { + // todo: do we need to consider it like java ? + // self.log_fetch_buffer.poll(); + } + return Err(e); + } } + } else { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(completed_fetch)); + } + // Note: poll() already removed the fetch from buffer, so no need to call poll() + } else { + // No more fetches available + break; + } + } else { + // Fetch records from next_in_line + if let Some(mut next_fetch) = next_in_line { + let records = match self + .fetch_records_from_fetch(&mut next_fetch, records_remaining) + { + Ok(records) => records, Err(e) => { - // Remove a completedFetch upon a parse with exception if - // (1) it contains no records, and - // (2) there are no fetched records with actual content preceding this - // exception. - if result.is_empty() && size_in_bytes == 0 { - // todo: do we need to consider it like java ? 
- // self.log_fetch_buffer.poll(); + if !next_fetch.is_consumed() { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(next_fetch)); } return Err(e); } + }; + + if !records.is_empty() { + let table_bucket = next_fetch.table_bucket().clone(); + // Merge with existing records for this bucket + let existing = result.entry(table_bucket).or_default(); + let records_count = records.len(); + existing.extend(records); + + records_remaining = records_remaining.saturating_sub(records_count); } - } else { - self.log_fetch_buffer - .set_next_in_line_fetch(Some(completed_fetch)); + + // If the fetch is not fully consumed, put it back for the next round + if !next_fetch.is_consumed() { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(next_fetch)); + } + // If consumed, next_fetch will be dropped here (which is correct) } - // Note: poll() already removed the fetch from buffer, so no need to call poll() - } else { - // No more fetches available - break; } - } else { - // Fetch records from next_in_line - if let Some(mut next_fetch) = next_in_line { - let records = - self.fetch_records_from_fetch(&mut next_fetch, records_remaining)?; - - if !records.is_empty() { - let table_bucket = next_fetch.table_bucket().clone(); - // Merge with existing records for this bucket - let existing = result.entry(table_bucket).or_default(); - let records_count = records.len(); - existing.extend(records); - - records_remaining = records_remaining.saturating_sub(records_count); - } + } + Ok(()) + }; - // If the fetch is not fully consumed, put it back for the next round - if !next_fetch.is_consumed() { - self.log_fetch_buffer - .set_next_in_line_fetch(Some(next_fetch)); - } - // If consumed, next_fetch will be dropped here (which is correct) + match collect_result { + Ok(()) => Ok(result), + Err(e) => { + if result.is_empty() { + Err(e) + } else { + Ok(result) } } } - - Ok(result) } /// Initialize a completed fetch, checking offset match and updating high watermark @@ -833,12 +955,63 @@ impl 
LogFetcher { &self, mut completed_fetch: Box, ) -> Result>> { - // todo: handle error in initialize fetch - let table_bucket = completed_fetch.table_bucket(); + if let Some(error) = completed_fetch.take_error() { + return Err(error); + } + + let table_bucket = completed_fetch.table_bucket().clone(); let fetch_offset = completed_fetch.next_fetch_offset(); + if let Some(api_error) = completed_fetch.api_error() { + let error = FlussError::for_code(api_error.code); + let error_message = api_error.message.as_str(); + self.log_scanner_status + .move_bucket_to_end(table_bucket.clone()); + let action = completed_fetch + .fetch_error_context() + .map(|context| context.action) + .unwrap_or(FetchErrorAction::Unexpected); + match action { + FetchErrorAction::Ignore => { + return Ok(None); + } + FetchErrorAction::LogOffsetOutOfRange => { + return Err(Error::UnexpectedError { + message: format!( + "The fetching offset {fetch_offset} is out of range: {error_message}" + ), + source: None, + }); + } + FetchErrorAction::Authorization => { + return Err(Error::FlussAPIError { + api_error: ApiError { + code: api_error.code, + message: api_error.message.to_string(), + }, + }); + } + FetchErrorAction::CorruptMessage => { + return Err(Error::UnexpectedError { + message: format!( + "Encountered corrupt message when fetching offset {fetch_offset} for bucket {table_bucket}: {error_message}" + ), + source: None, + }); + } + FetchErrorAction::Unexpected => { + return Err(Error::UnexpectedError { + message: format!( + "Unexpected error code {error:?} while fetching at offset {fetch_offset} from bucket {table_bucket}: {error_message}" + ), + source: None, + }); + } + } + } + // Check if bucket is still subscribed - let Some(current_offset) = self.log_scanner_status.get_bucket_offset(table_bucket) else { + let Some(current_offset) = self.log_scanner_status.get_bucket_offset(&table_bucket) else { warn!( "Discarding stale fetch response for bucket {table_bucket:?} since the bucket has been 
unsubscribed" ); @@ -857,7 +1030,7 @@ impl LogFetcher { let high_watermark = completed_fetch.high_watermark(); if high_watermark >= 0 { self.log_scanner_status - .update_high_watermark(table_bucket, high_watermark); + .update_high_watermark(&table_bucket, high_watermark); } completed_fetch.set_initialized(); @@ -894,6 +1067,11 @@ impl LogFetcher { .update_offset(&table_bucket, next_fetch_offset); } + if next_in_line_fetch.is_consumed() && next_in_line_fetch.records_read() > 0 { + self.log_scanner_status + .move_bucket_to_end(table_bucket.clone()); + } + Ok(records) } else { // These records aren't next in line, ignore them @@ -915,58 +1093,70 @@ impl LogFetcher { let mut batches_remaining = MAX_BATCHES; let mut bytes_consumed: usize = 0; - while batches_remaining > 0 && bytes_consumed < MAX_BYTES { - let next_in_line = self.log_fetch_buffer.next_in_line_fetch(); + let collect_result: Result<()> = { + while batches_remaining > 0 && bytes_consumed < MAX_BYTES { + let next_in_line = self.log_fetch_buffer.next_in_line_fetch(); - match next_in_line { - Some(mut next_fetch) if !next_fetch.is_consumed() => { - let batches = - self.fetch_batches_from_fetch(&mut next_fetch, batches_remaining)?; - let batch_count = batches.len(); + match next_in_line { + Some(mut next_fetch) if !next_fetch.is_consumed() => { + let batches = + self.fetch_batches_from_fetch(&mut next_fetch, batches_remaining)?; + let batch_count = batches.len(); - if !batches.is_empty() { - // Track bytes consumed (soft cap - may exceed by one fetch) - let batch_bytes: usize = - batches.iter().map(|b| b.get_array_memory_size()).sum(); - bytes_consumed += batch_bytes; + if !batches.is_empty() { + // Track bytes consumed (soft cap - may exceed by one fetch) + let batch_bytes: usize = + batches.iter().map(|b| b.get_array_memory_size()).sum(); + bytes_consumed += batch_bytes; - result.extend(batches); - batches_remaining = batches_remaining.saturating_sub(batch_count); - } + result.extend(batches); + 
batches_remaining = batches_remaining.saturating_sub(batch_count); + } - if !next_fetch.is_consumed() { - self.log_fetch_buffer - .set_next_in_line_fetch(Some(next_fetch)); + if !next_fetch.is_consumed() { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(next_fetch)); + } } - } - _ => { - if let Some(completed_fetch) = self.log_fetch_buffer.poll() { - if !completed_fetch.is_initialized() { - let size_in_bytes = completed_fetch.size_in_bytes(); - match self.initialize_fetch(completed_fetch) { - Ok(initialized) => { - self.log_fetch_buffer.set_next_in_line_fetch(initialized); - continue; - } - Err(e) => { - if result.is_empty() && size_in_bytes == 0 { + _ => { + if let Some(completed_fetch) = self.log_fetch_buffer.poll() { + if !completed_fetch.is_initialized() { + let size_in_bytes = completed_fetch.size_in_bytes(); + match self.initialize_fetch(completed_fetch) { + Ok(initialized) => { + self.log_fetch_buffer.set_next_in_line_fetch(initialized); continue; } - return Err(e); + Err(e) => { + if result.is_empty() && size_in_bytes == 0 { + continue; + } + return Err(e); + } } + } else { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(completed_fetch)); } } else { - self.log_fetch_buffer - .set_next_in_line_fetch(Some(completed_fetch)); + break; } - } else { - break; } } } - } + Ok(()) + }; - Ok(result) + match collect_result { + Ok(()) => Ok(result), + Err(e) => { + if result.is_empty() { + Err(e) + } else { + Ok(result) + } + } + } } fn fetch_batches_from_fetch( @@ -1231,3 +1421,175 @@ impl BucketScanStatus { *self.high_watermark.write() = high_watermark } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::client::WriteRecord; + use crate::client::metadata::Metadata; + use crate::compression::{ + ArrowCompressionInfo, ArrowCompressionType, DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }; + use crate::metadata::{TableInfo, TablePath}; + use crate::record::MemoryLogRecordsArrowBuilder; + use crate::row::{Datum, GenericRow}; + use crate::rpc::FlussError; + 
use crate::test_utils::{build_cluster_arc, build_table_info}; + + fn build_records(table_info: &TableInfo, table_path: Arc) -> Result> { + let mut builder = MemoryLogRecordsArrowBuilder::new( + 1, + table_info.get_row_type(), + false, + ArrowCompressionInfo { + compression_type: ArrowCompressionType::None, + compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }, + ); + let record = WriteRecord::new( + table_path, + GenericRow { + values: vec![Datum::Int32(1)], + }, + ); + builder.append(&record)?; + builder.build() + } + + #[tokio::test] + async fn collect_fetches_updates_offset() -> Result<()> { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = build_table_info(table_path.clone(), 1, 1); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster)); + let status = Arc::new(LogScannerStatus::new()); + let fetcher = LogFetcher::new( + table_info.clone(), + Arc::new(RpcClient::new()), + metadata, + status.clone(), + None, + )?; + + let bucket = TableBucket::new(1, 0); + status.assign_scan_bucket(bucket.clone(), 0); + + let data = build_records(&table_info, Arc::new(table_path))?; + let log_records = LogRecordsBatches::new(data.clone()); + let read_context = ReadContext::new(to_arrow_schema(table_info.get_row_type()), false); + let completed = + DefaultCompletedFetch::new(bucket.clone(), log_records, data.len(), read_context, 0, 0); + fetcher.log_fetch_buffer.add(Box::new(completed)); + + let fetched = fetcher.collect_fetches()?; + assert_eq!(fetched.get(&bucket).unwrap().len(), 1); + assert_eq!(status.get_bucket_offset(&bucket), Some(1)); + Ok(()) + } + + #[test] + fn fetch_records_from_fetch_drains_unassigned_bucket() -> Result<()> { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = build_table_info(table_path.clone(), 1, 1); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = 
Arc::new(Metadata::new_for_test(cluster)); + let status = Arc::new(LogScannerStatus::new()); + let fetcher = LogFetcher::new( + table_info.clone(), + Arc::new(RpcClient::new()), + metadata, + status, + None, + )?; + + let bucket = TableBucket::new(1, 0); + let data = build_records(&table_info, Arc::new(table_path))?; + let log_records = LogRecordsBatches::new(data.clone()); + let read_context = ReadContext::new(to_arrow_schema(table_info.get_row_type()), false); + let mut completed: Box = Box::new(DefaultCompletedFetch::new( + bucket, + log_records, + data.len(), + read_context, + 0, + 0, + )); + + let records = fetcher.fetch_records_from_fetch(&mut completed, 10)?; + assert!(records.is_empty()); + assert!(completed.is_consumed()); + Ok(()) + } + + #[tokio::test] + async fn prepare_fetch_log_requests_skips_pending() -> Result<()> { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = build_table_info(table_path.clone(), 1, 1); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster)); + let status = Arc::new(LogScannerStatus::new()); + status.assign_scan_bucket(TableBucket::new(1, 0), 0); + let fetcher = LogFetcher::new( + table_info, + Arc::new(RpcClient::new()), + metadata, + status, + None, + )?; + + fetcher.nodes_with_pending_fetch_requests.lock().insert(1); + + let requests = fetcher.prepare_fetch_log_requests().await; + assert!(requests.is_empty()); + Ok(()) + } + + #[tokio::test] + async fn handle_fetch_response_sets_error() -> Result<()> { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = build_table_info(table_path.clone(), 1, 1); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster)); + let status = Arc::new(LogScannerStatus::new()); + status.assign_scan_bucket(TableBucket::new(1, 0), 5); + let fetcher = LogFetcher::new( + table_info.clone(), + 
Arc::new(RpcClient::new()), + metadata.clone(), + status.clone(), + None, + )?; + + let response = crate::proto::FetchLogResponse { + tables_resp: vec![crate::proto::PbFetchLogRespForTable { + table_id: 1, + buckets_resp: vec![crate::proto::PbFetchLogRespForBucket { + partition_id: None, + bucket_id: 0, + error_code: Some(FlussError::AuthorizationException.code()), + error_message: Some("denied".to_string()), + high_watermark: None, + log_start_offset: None, + remote_log_fetch_info: None, + records: None, + }], + }], + }; + + LogFetcher::handle_fetch_response( + response, + &fetcher.log_fetch_buffer, + &fetcher.log_scanner_status, + &fetcher.read_context, + &fetcher.remote_read_context, + &fetcher.remote_log_downloader, + &fetcher.credentials_cache, + ) + .await; + + let completed = fetcher.log_fetch_buffer.poll().expect("completed fetch"); + let api_error = completed.api_error().expect("api error"); + assert_eq!(api_error.code, FlussError::AuthorizationException.code()); + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/sender.rs b/fluss-rust/crates/fluss/src/client/write/sender.rs index cb03a2c462..ffac0af8ae 100644 --- a/fluss-rust/crates/fluss/src/client/write/sender.rs +++ b/fluss-rust/crates/fluss/src/client/write/sender.rs @@ -455,7 +455,7 @@ mod tests { use crate::row::{Datum, GenericRow}; use crate::rpc::FlussError; use crate::test_utils::build_cluster_arc; - use std::collections::HashSet; + use std::collections::{HashMap, HashSet}; async fn build_ready_batch( accumulator: &RecordAccumulator, diff --git a/fluss-rust/crates/fluss/src/error.rs b/fluss-rust/crates/fluss/src/error.rs index 0a368b7be9..368d8abc80 100644 --- a/fluss-rust/crates/fluss/src/error.rs +++ b/fluss-rust/crates/fluss/src/error.rs @@ -99,6 +99,11 @@ pub enum Error { )] IoUnsupported { message: String }, + #[snafu( + visibility(pub(crate)), + display("Fluss hitting wakeup error {}.", message) + )] + WakeupError { message: String }, #[snafu( visibility(pub(crate)), 
display("Fluss hitting unsupported operation error {}.", message) diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index 89fb7b9c94..c166ebe8f6 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -17,7 +17,7 @@ use crate::client::{Record, WriteRecord}; use crate::compression::ArrowCompressionInfo; -use crate::error::Result; +use crate::error::{Error, Result}; use crate::metadata::DataType; use crate::record::{ChangeType, ScanRecord}; use crate::row::{ColumnarRow, GenericRow}; @@ -446,7 +446,7 @@ impl LogRecordBatch { } pub fn ensure_valid(&self) -> Result<()> { - // todo + // TODO enable validation once checksum handling is corrected. Ok(()) } @@ -780,8 +780,10 @@ impl ReadContext { arrow_schema: SchemaRef, projected_fields: Vec, is_from_remote: bool, - ) -> ReadContext { - let target_schema = Self::project_schema(arrow_schema.clone(), projected_fields.as_slice()); + ) -> Result { + Self::validate_projection(&arrow_schema, projected_fields.as_slice())?; + let target_schema = + Self::project_schema(arrow_schema.clone(), projected_fields.as_slice())?; // the logic is little bit of hard to understand, to refactor it to follow // java side let (need_do_reorder, sorted_fields) = { @@ -804,16 +806,20 @@ impl ReadContext { // Calculate reordering indexes to transform from sorted order to user-requested order let mut reordering_indexes = Vec::with_capacity(projected_fields.len()); for &original_idx in &projected_fields { - let pos = sorted_fields - .binary_search(&original_idx) - .expect("projection index should exist in sorted list"); + let pos = sorted_fields.binary_search(&original_idx).map_err(|_| { + Error::IllegalArgument { + message: format!( + "Projection index {original_idx} is invalid for the current schema." 
+ ), + } + })?; reordering_indexes.push(pos); } Projection { ordered_schema: Self::project_schema( arrow_schema.clone(), sorted_fields.as_slice(), - ), + )?, projected_fields, ordered_fields: sorted_fields, reordering_indexes, @@ -824,7 +830,7 @@ impl ReadContext { ordered_schema: Self::project_schema( arrow_schema.clone(), projected_fields.as_slice(), - ), + )?, ordered_fields: projected_fields.clone(), projected_fields, reordering_indexes: vec![], @@ -833,21 +839,34 @@ impl ReadContext { } }; - ReadContext { + Ok(ReadContext { target_schema, full_schema: arrow_schema, projection: Some(project), is_from_remote, + }) + } + + fn validate_projection(schema: &SchemaRef, projected_fields: &[usize]) -> Result<()> { + let field_count = schema.fields().len(); + for &index in projected_fields { + if index >= field_count { + return Err(Error::IllegalArgument { + message: format!( + "Projection index {index} is out of bounds for schema with {field_count} fields." + ), + }); + } } + Ok(()) } - pub fn project_schema(schema: SchemaRef, projected_fields: &[usize]) -> SchemaRef { - // todo: handle the exception - SchemaRef::new( - schema - .project(projected_fields) - .expect("can't project schema"), - ) + pub fn project_schema(schema: SchemaRef, projected_fields: &[usize]) -> Result { + Ok(SchemaRef::new(schema.project(projected_fields).map_err( + |e| Error::IllegalArgument { + message: format!("Invalid projection: {e}"), + }, + )?)) } pub fn project_fields(&self) -> Option<&[usize]> { @@ -1035,6 +1054,8 @@ pub struct MyVec(pub StreamReader); #[cfg(test)] mod tests { use super::*; + use crate::error::Error; + use crate::metadata::DataField; use crate::metadata::DataTypes; #[test] @@ -1207,6 +1228,18 @@ mod tests { ); } + #[test] + fn projection_rejects_out_of_bounds_index() { + let row_type = DataTypes::row(vec![ + DataField::new("id".to_string(), DataTypes::int(), None), + DataField::new("name".to_string(), DataTypes::string(), None), + ]); + let schema = 
to_arrow_schema(&row_type); + let result = ReadContext::with_projection_pushdown(schema, vec![0, 2], false); + + assert!(matches!(result, Err(Error::IllegalArgument { .. }))); + } + fn le_bytes(vals: &[u32]) -> Vec { let mut out = Vec::with_capacity(vals.len() * 4); for &v in vals { diff --git a/fluss-rust/crates/fluss/src/record/mod.rs b/fluss-rust/crates/fluss/src/record/mod.rs index c5a3f8e4b6..94997e8d80 100644 --- a/fluss-rust/crates/fluss/src/record/mod.rs +++ b/fluss-rust/crates/fluss/src/record/mod.rs @@ -182,3 +182,65 @@ impl IntoIterator for ScanRecords { .into_iter() } } + +#[cfg(test)] +mod tests { + use super::*; + use ::arrow::array::{Int32Array, RecordBatch}; + use ::arrow::datatypes::{DataType, Field, Schema}; + use std::sync::Arc; + + fn make_row(values: Vec, row_id: usize) -> ColumnarRow { + let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, false)])); + let batch = RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(values))]) + .expect("record batch"); + ColumnarRow::new_with_row_id(Arc::new(batch), row_id) + } + + #[test] + fn change_type_round_trip() { + let cases = [ + (ChangeType::AppendOnly, "+A", 0), + (ChangeType::Insert, "+I", 1), + (ChangeType::UpdateBefore, "-U", 2), + (ChangeType::UpdateAfter, "+U", 3), + (ChangeType::Delete, "-D", 4), + ]; + + for (change_type, short, byte) in cases { + assert_eq!(change_type.short_string(), short); + assert_eq!(change_type.to_byte_value(), byte); + assert_eq!(ChangeType::from_byte_value(byte).unwrap(), change_type); + } + + let err = ChangeType::from_byte_value(9).unwrap_err(); + assert!(err.contains("Unsupported byte value")); + } + + #[test] + fn scan_records_counts_and_iterates() { + let bucket0 = TableBucket::new(1, 0); + let bucket1 = TableBucket::new(1, 1); + let record0 = ScanRecord::new(make_row(vec![10, 11], 0), 5, 7, ChangeType::Insert); + let record1 = ScanRecord::new(make_row(vec![10, 11], 1), 6, 8, ChangeType::Delete); + + let mut records = HashMap::new(); + 
records.insert(bucket0.clone(), vec![record0.clone(), record1.clone()]); + + let scan_records = ScanRecords::new(records); + assert_eq!(scan_records.records(&bucket0).len(), 2); + assert!(scan_records.records(&bucket1).is_empty()); + assert_eq!(scan_records.count(), 2); + + let collected: Vec<_> = scan_records.into_iter().collect(); + assert_eq!(collected.len(), 2); + } + + #[test] + fn scan_record_default_values() { + let record = ScanRecord::new_default(make_row(vec![1], 0)); + assert_eq!(record.offset(), -1); + assert_eq!(record.timestamp(), -1); + assert_eq!(record.change_type(), &ChangeType::Insert); + } +} diff --git a/fluss-rust/crates/fluss/src/row/column.rs b/fluss-rust/crates/fluss/src/row/column.rs index 31f0fdf298..90437c11aa 100644 --- a/fluss-rust/crates/fluss/src/row/column.rs +++ b/fluss-rust/crates/fluss/src/row/column.rs @@ -166,3 +166,67 @@ impl InternalRow for ColumnarRow { .value(self.row_id) } } + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{ + BinaryArray, BooleanArray, FixedSizeBinaryArray, Float32Array, Float64Array, Int8Array, + Int16Array, Int32Array, Int64Array, StringArray, + }; + use arrow::datatypes::{DataType, Field, Schema}; + + #[test] + fn columnar_row_reads_values() { + let schema = Arc::new(Schema::new(vec![ + Field::new("b", DataType::Boolean, false), + Field::new("i8", DataType::Int8, false), + Field::new("i16", DataType::Int16, false), + Field::new("i32", DataType::Int32, false), + Field::new("i64", DataType::Int64, false), + Field::new("f32", DataType::Float32, false), + Field::new("f64", DataType::Float64, false), + Field::new("s", DataType::Utf8, false), + Field::new("bin", DataType::Binary, false), + Field::new("char", DataType::FixedSizeBinary(2), false), + ])); + + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(BooleanArray::from(vec![true])), + Arc::new(Int8Array::from(vec![1])), + Arc::new(Int16Array::from(vec![2])), + Arc::new(Int32Array::from(vec![3])), + 
Arc::new(Int64Array::from(vec![4])), + Arc::new(Float32Array::from(vec![1.25])), + Arc::new(Float64Array::from(vec![2.5])), + Arc::new(StringArray::from(vec!["hello"])), + Arc::new(BinaryArray::from(vec![b"data".as_slice()])), + Arc::new( + FixedSizeBinaryArray::try_from_sparse_iter_with_size( + vec![Some(b"ab".as_slice())].into_iter(), + 2, + ) + .expect("fixed array"), + ), + ], + ) + .expect("record batch"); + + let mut row = ColumnarRow::new(Arc::new(batch)); + assert_eq!(row.get_field_count(), 10); + assert!(row.get_boolean(0)); + assert_eq!(row.get_byte(1), 1); + assert_eq!(row.get_short(2), 2); + assert_eq!(row.get_int(3), 3); + assert_eq!(row.get_long(4), 4); + assert_eq!(row.get_float(5), 1.25); + assert_eq!(row.get_double(6), 2.5); + assert_eq!(row.get_string(7), "hello"); + assert_eq!(row.get_bytes(8), b"data"); + assert_eq!(row.get_char(9, 2), "ab"); + row.set_row_id(0); + assert_eq!(row.get_row_id(), 0); + } +} diff --git a/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs b/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs index 9ab1f143f3..fcecb4188c 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs @@ -17,9 +17,9 @@ use crate::{impl_read_version_type, impl_write_version_type, proto}; -use crate::error::Error; use crate::error::Result as FlussResult; -use crate::proto::ListOffsetsResponse; +use crate::error::{Error, FlussError}; +use crate::proto::{ErrorResponse, ListOffsetsResponse}; use crate::rpc::frame::ReadError; use crate::rpc::api_key::ApiKey; @@ -108,22 +108,48 @@ impl ListOffsetsResponse { self.buckets_resp .iter() .map(|resp| { - if resp.error_code.is_some() { - // todo: consider use another suitable error - Err(Error::UnexpectedError { + if let Some(error_code) = resp.error_code + && error_code != FlussError::None.code() + { + let api_error = ErrorResponse { + error_code, + error_message: resp.error_message.clone(), + } + .into(); + return 
Err(Error::FlussAPIError { api_error }); + } + // if there is no error code, the offset must exist + resp.offset + .map(|offset| (resp.bucket_id, offset)) + .ok_or_else(|| Error::UnexpectedError { message: format!( - "Missing offset, error message: {}", - resp.error_message - .as_deref() - .unwrap_or("unknown server exception") + "Missing offset for bucket {} without error code.", + resp.bucket_id ), source: None, }) - } else { - // if no error msg, offset must exists - Ok((resp.bucket_id, resp.offset.unwrap())) - } }) .collect() } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::proto::{ListOffsetsResponse, PbListOffsetsRespForBucket}; + + #[test] + fn offsets_returns_api_error_on_error_code() { + let response = ListOffsetsResponse { + buckets_resp: vec![PbListOffsetsRespForBucket { + bucket_id: 1, + error_code: Some(FlussError::TableNotExist.code()), + error_message: Some("missing".to_string()), + offset: None, + }], + }; + + let result = response.offsets(); + assert!(matches!(result, Err(Error::FlussAPIError { .. 
}))); + } +} diff --git a/fluss-rust/crates/fluss/src/util/mod.rs b/fluss-rust/crates/fluss/src/util/mod.rs index d191615e86..30424e5d16 100644 --- a/fluss-rust/crates/fluss/src/util/mod.rs +++ b/fluss-rust/crates/fluss/src/util/mod.rs @@ -184,3 +184,57 @@ impl Default for FairBucketStatusMap { Self::new() } } + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Arc; + + #[test] + fn fair_bucket_status_map_tracks_order_and_size() { + let bucket0 = TableBucket::new(1, 0); + let bucket1 = TableBucket::new(1, 1); + + let mut map = FairBucketStatusMap::new(); + map.update_and_move_to_end(bucket0.clone(), 10); + map.update_and_move_to_end(bucket1.clone(), 20); + assert_eq!(map.size(), 2); + + let values: Vec = map + .bucket_status_values() + .into_iter() + .map(|value| **value) + .collect(); + assert_eq!(values, vec![10, 20]); + + map.move_to_end(bucket0.clone()); + let values: Vec = map + .bucket_status_values() + .into_iter() + .map(|value| **value) + .collect(); + assert_eq!(values, vec![20, 10]); + } + + #[test] + fn fair_bucket_status_map_mutations() { + let bucket0 = TableBucket::new(1, 0); + let bucket1 = TableBucket::new(2, 1); + + let mut map = FairBucketStatusMap::new(); + let mut input = HashMap::new(); + input.insert(bucket0.clone(), Arc::new(1)); + input.insert(bucket1.clone(), Arc::new(2)); + map.set(input); + + assert!(map.contains(&bucket0)); + assert!(map.contains(&bucket1)); + assert_eq!(map.bucket_set().len(), 2); + + map.remove(&bucket1); + assert_eq!(map.size(), 1); + + map.clear(); + assert_eq!(map.size(), 0); + } +} From ffc026ddcab8c22c281909afaa75265774b374e3 Mon Sep 17 00:00:00 2001 From: AlexZhao Date: Sat, 17 Jan 2026 15:37:07 +0800 Subject: [PATCH 072/287] chore: correct log record batch checksum bounds (#172) --- fluss-rust/crates/fluss/src/record/arrow.rs | 30 ++++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs 
index c166ebe8f6..b331ae9d7f 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -86,6 +86,8 @@ pub enum LogMagicValue { V0 = 0, } +// NOTE: Rust layout/offsets currently match Java only for V0. +// TODO: Add V1 layout/offsets to keep parity with Java's V1 format. pub const CURRENT_LOG_MAGIC_VALUE: u8 = LogMagicValue::V0 as u8; /// Value used if writer ID is not available or non-idempotent. @@ -457,8 +459,7 @@ impl LogRecordBatch { fn compute_checksum(&self) -> u32 { let start = SCHEMA_ID_OFFSET; - let end = start + self.data.len(); - crc32c(&self.data[start..end]) + crc32c(&self.data[start..]) } fn attributes(&self) -> u8 { @@ -471,12 +472,12 @@ impl LogRecordBatch { pub fn checksum(&self) -> u32 { let offset = CRC_OFFSET; - LittleEndian::read_u32(&self.data[offset..offset + CRC_OFFSET]) + LittleEndian::read_u32(&self.data[offset..offset + CRC_LENGTH]) } pub fn schema_id(&self) -> i16 { let offset = SCHEMA_ID_OFFSET; - LittleEndian::read_i16(&self.data[offset..offset + SCHEMA_ID_OFFSET]) + LittleEndian::read_i16(&self.data[offset..offset + SCHEMA_ID_LENGTH]) } pub fn base_log_offset(&self) -> i64 { @@ -1240,6 +1241,27 @@ mod tests { assert!(matches!(result, Err(Error::IllegalArgument { .. }))); } + #[test] + fn checksum_and_schema_id_read_minimum_header() { + // Header-only batches with record_count == 0 are valid; this covers the minimal bytes + // needed for checksum/schema_id access. 
+ let mut data = vec![0u8; SCHEMA_ID_OFFSET + SCHEMA_ID_LENGTH]; + let crc = 0xA1B2C3D4u32; + let schema_id = 42i16; + LittleEndian::write_u32(&mut data[CRC_OFFSET..CRC_OFFSET + CRC_LENGTH], crc); + LittleEndian::write_i16( + &mut data[SCHEMA_ID_OFFSET..SCHEMA_ID_OFFSET + SCHEMA_ID_LENGTH], + schema_id, + ); + + let batch = LogRecordBatch::new(Bytes::from(data)); + assert_eq!(batch.checksum(), crc); + assert_eq!(batch.schema_id(), schema_id); + + let expected = crc32c(&batch.data[SCHEMA_ID_OFFSET..]); + assert_eq!(batch.compute_checksum(), expected); + } + fn le_bytes(vals: &[u32]) -> Vec { let mut out = Vec::with_capacity(vals.len() * 4); for &v in vals { From 41f824fe778ab648df844a295747c2da32cbb220 Mon Sep 17 00:00:00 2001 From: Anton Borisov <148864665+fresh-borzoni@users.noreply.github.com> Date: Sat, 17 Jan 2026 15:41:12 +0000 Subject: [PATCH 073/287] feat: Introduce KVReadContext and read path wiring (#174) --- .../crates/fluss/src/record/kv/kv_record.rs | 166 ++++++++-------- .../fluss/src/record/kv/kv_record_batch.rs | 115 ++++++++--- .../src/record/kv/kv_record_batch_builder.rs | 154 ++++++++------- .../src/record/kv/kv_record_read_context.rs | 179 ++++++++++++++++++ fluss-rust/crates/fluss/src/record/kv/mod.rs | 7 + .../fluss/src/record/kv/read_context.rs | 45 +++++ .../crates/fluss/src/record/kv/test_util.rs | 50 +++++ fluss-rust/crates/fluss/src/row/mod.rs | 2 + .../crates/fluss/src/row/row_decoder.rs | 137 ++++++++++++++ 9 files changed, 674 insertions(+), 181 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs create mode 100644 fluss-rust/crates/fluss/src/record/kv/read_context.rs create mode 100644 fluss-rust/crates/fluss/src/record/kv/test_util.rs create mode 100644 fluss-rust/crates/fluss/src/row/row_decoder.rs diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record.rs index ab8c2ac1dd..a9c45d6971 100644 --- 
a/fluss-rust/crates/fluss/src/record/kv/kv_record.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record.rs @@ -27,6 +27,8 @@ use bytes::{BufMut, Bytes, BytesMut}; use std::io; +use crate::row::RowDecoder; +use crate::row::compacted::CompactedRow; use crate::util::varint::{ read_unsigned_varint_bytes, size_of_unsigned_varint, write_unsigned_varint_buf, }; @@ -34,7 +36,7 @@ use crate::util::varint::{ /// Length field size in bytes pub const LENGTH_LENGTH: usize = 4; -/// A key-value record. +/// A key-value record containing raw key and value bytes. /// /// The schema is: /// - Length => Int32 @@ -43,34 +45,39 @@ pub const LENGTH_LENGTH: usize = 4; /// - Value => bytes (BinaryRow, written directly without length prefix) /// /// When the value is None (deletion), no Value bytes are present. -// Reference implementation: -// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/record/KvRecord.java +/// +/// This struct stores only raw bytes. To decode the value into a typed row, +/// use the `row()` method with a RowDecoder (typically obtained from the iterator). +/// +/// Reference implementation: +/// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/record/KvRecord.java #[derive(Debug, Clone)] pub struct KvRecord { key: Bytes, - value: Option, + value_bytes: Option, size_in_bytes: usize, } impl KvRecord { - /// Create a new KvRecord with the given key and optional value. - pub fn new(key: Bytes, value: Option) -> Self { - let size_in_bytes = Self::size_of(&key, value.as_deref()); - Self { - key, - value, - size_in_bytes, - } - } - /// Get the key bytes. pub fn key(&self) -> &Bytes { &self.key } - /// Get the value bytes (None indicates a deletion). - pub fn value(&self) -> Option<&Bytes> { - self.value.as_ref() + /// Get the raw value bytes (for testing). 
+ #[cfg(test)] + pub(crate) fn value_bytes(&self) -> Option<&Bytes> { + self.value_bytes.as_ref() + } + + /// Decode the value bytes into a typed row using the provided decoder. + /// This creates a lightweight CompactedRow view over the raw bytes. + /// Actual field parsing is lazy (on first access). + pub fn row<'a>(&'a self, decoder: &dyn RowDecoder) -> Option> { + self.value_bytes.as_ref().map(|bytes| { + // Decode on-demand - CompactedRow<'a> lifetime tied to &'a self + decoder.decode(bytes.as_ref()) + }) } /// Calculate the total size of the record when serialized (including length prefix). @@ -121,8 +128,7 @@ impl KvRecord { /// Read a KV record from bytes at the given position. /// /// Returns the KvRecord and the number of bytes consumed. - /// - /// TODO: Connect KvReadContext and return CompactedRow records. + /// The record contains only raw bytes; use `row()` with a RowDecoder to decode the value. pub fn read_from(bytes: &Bytes, position: usize) -> io::Result<(Self, usize)> { if bytes.len() < position.saturating_add(LENGTH_LENGTH) { return Err(io::Error::new( @@ -183,11 +189,10 @@ impl KvRecord { let key = bytes.slice(current_offset..key_end); current_offset = key_end; - // Read value bytes directly - let value = if current_offset < record_end { + // Read value bytes directly (don't decode yet - will decode on-demand) + let value_bytes = if current_offset < record_end { // Value is present: all remaining bytes are the value - let value_bytes = bytes.slice(current_offset..record_end); - Some(value_bytes) + Some(bytes.slice(current_offset..record_end)) } else { // No remaining bytes: this is a deletion record None @@ -196,7 +201,7 @@ impl KvRecord { Ok(( Self { key, - value, + value_bytes, size_in_bytes: total_size, }, total_size, @@ -207,6 +212,11 @@ impl KvRecord { pub fn get_size_in_bytes(&self) -> usize { self.size_in_bytes } + + /// Check if this is a deletion record (no value). 
+ pub fn is_deletion(&self) -> bool { + self.value_bytes.is_none() + } } #[cfg(test)] @@ -214,30 +224,25 @@ mod tests { use super::*; #[test] - fn test_kv_record_size_calculation() { + fn test_kv_record_basic_operations() { let key = b"test_key"; let value = b"test_value"; - // With value (no value length varint) + // Test size calculation with value let size_with_value = KvRecord::size_of(key, Some(value)); assert_eq!( size_with_value, LENGTH_LENGTH + size_of_unsigned_varint(key.len() as u32) + key.len() + value.len() ); - // Without value + // Test size calculation without value (deletion) let size_without_value = KvRecord::size_of(key, None); assert_eq!( size_without_value, LENGTH_LENGTH + size_of_unsigned_varint(key.len() as u32) + key.len() ); - } - - #[test] - fn test_kv_record_write_read_round_trip() { - let key = b"my_key"; - let value = b"my_value_data"; + // Test write/read round trip with value let mut buf = BytesMut::new(); let written = KvRecord::write_to_buf(&mut buf, key, Some(value)).unwrap(); @@ -246,40 +251,70 @@ mod tests { assert_eq!(written, read_size); assert_eq!(record.key().as_ref(), key); - assert_eq!(record.value().unwrap().as_ref(), value); + assert_eq!(record.value_bytes().unwrap().as_ref(), value); assert_eq!(record.get_size_in_bytes(), written); - } - - #[test] - fn test_kv_record_deletion() { - let key = b"delete_me"; + assert!(!record.is_deletion()); - // Write deletion record (no value) + // Test deletion record (no value) + let delete_key = b"delete_me"; let mut buf = BytesMut::new(); - let written = KvRecord::write_to_buf(&mut buf, key, None).unwrap(); + let written = KvRecord::write_to_buf(&mut buf, delete_key, None).unwrap(); let bytes = buf.freeze(); let (record, read_size) = KvRecord::read_from(&bytes, 0).unwrap(); assert_eq!(written, read_size); - assert_eq!(record.key().as_ref(), key); - assert!(record.value().is_none()); + assert_eq!(record.key().as_ref(), delete_key); + assert!(record.is_deletion()); + 
assert!(record.value_bytes().is_none()); } #[test] - fn test_kv_record_with_large_key() { - let key = vec![0u8; 1024]; - let value = vec![1u8; 4096]; + fn test_kv_record_multiple_records() { + // Test multiple regular-sized records in buffer + let records = vec![ + (b"key1".as_slice(), Some(b"value1".as_slice())), + (b"key2".as_slice(), None), // Deletion + (b"key3".as_slice(), Some(b"value3".as_slice())), + ]; let mut buf = BytesMut::new(); - let written = KvRecord::write_to_buf(&mut buf, &key, Some(&value)).unwrap(); + for (key, value) in &records { + KvRecord::write_to_buf(&mut buf, key, *value).unwrap(); + } + + let bytes = buf.freeze(); + let mut offset = 0; + for (expected_key, expected_value) in &records { + let (record, size) = KvRecord::read_from(&bytes, offset).unwrap(); + assert_eq!(record.key().as_ref(), *expected_key); + match expected_value { + Some(v) => { + assert_eq!(record.value_bytes().unwrap().as_ref(), *v); + assert!(!record.is_deletion()); + } + None => { + assert!(record.is_deletion()); + assert!(record.value_bytes().is_none()); + } + } + offset += size; + } + assert_eq!(offset, bytes.len()); + + // Test large keys and values + let large_key = vec![0u8; 1024]; + let large_value = vec![1u8; 4096]; + + let mut buf = BytesMut::new(); + let written = KvRecord::write_to_buf(&mut buf, &large_key, Some(&large_value)).unwrap(); let bytes = buf.freeze(); let (record, read_size) = KvRecord::read_from(&bytes, 0).unwrap(); assert_eq!(written, read_size); - assert_eq!(record.key().len(), key.len()); - assert_eq!(record.value().unwrap().len(), value.len()); + assert_eq!(record.key().len(), large_key.len()); + assert_eq!(record.value_bytes().unwrap().len(), large_value.len()); } #[test] @@ -291,7 +326,9 @@ mod tests { let bytes = buf.freeze(); let result = KvRecord::read_from(&bytes, 0); assert!(result.is_err()); - assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidData); + if let Err(e) = result { + assert_eq!(e.kind(), 
io::ErrorKind::InvalidData); + } // Test overflow length let mut buf = BytesMut::new(); @@ -307,33 +344,8 @@ mod tests { let bytes = buf.freeze(); let result = KvRecord::read_from(&bytes, 0); assert!(result.is_err()); - assert_eq!(result.unwrap_err().kind(), io::ErrorKind::UnexpectedEof); - } - - #[test] - fn test_multiple_records_in_buffer() { - let records = vec![ - (b"key1".as_slice(), Some(b"value1".as_slice())), - (b"key2".as_slice(), None), - (b"key3".as_slice(), Some(b"value3".as_slice())), - ]; - - let mut buf = BytesMut::new(); - for (key, value) in &records { - KvRecord::write_to_buf(&mut buf, key, *value).unwrap(); + if let Err(e) = result { + assert_eq!(e.kind(), io::ErrorKind::UnexpectedEof); } - - let bytes = buf.freeze(); - let mut offset = 0; - for (expected_key, expected_value) in &records { - let (record, size) = KvRecord::read_from(&bytes, offset).unwrap(); - assert_eq!(record.key().as_ref(), *expected_key); - match expected_value { - Some(v) => assert_eq!(record.value().unwrap().as_ref(), *v), - None => assert!(record.value().is_none()), - } - offset += size; - } - assert_eq!(offset, bytes.len()); } } diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs index eb3c09ad34..32f712f82e 100644 --- a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs @@ -32,8 +32,11 @@ use bytes::Bytes; use std::io; +use std::sync::Arc; -use crate::record::kv::KvRecord; +use crate::error::Result; +use crate::record::kv::{KvRecord, ReadContext}; +use crate::row::RowDecoder; // Field lengths in bytes pub const LENGTH_LENGTH: usize = 4; @@ -253,38 +256,87 @@ impl KvRecordBatch { ])) } - /// Create an iterator over the records in this batch. - /// This validates the batch checksum before returning the iterator. + /// Create an iterable collection of records in this batch. 
+ /// + /// This validates the batch checksum before returning the records. /// For trusted data paths, use `records_unchecked()` to skip validation. - pub fn records(&self) -> io::Result { + /// + /// Mirrors: KvRecordBatch.records(ReadContext) + pub fn records(&self, read_context: &dyn ReadContext) -> Result { if !self.is_valid() { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "Invalid batch checksum", - )); + return Err(crate::error::Error::IoUnexpectedError { + message: "Invalid batch checksum".to_string(), + source: io::Error::new(io::ErrorKind::InvalidData, "Invalid batch checksum"), + }); } - self.records_unchecked() + self.records_unchecked(read_context) } - /// Create an iterator over the records in this batch without validating the checksum - pub fn records_unchecked(&self) -> io::Result { + /// Create an iterable collection of records in this batch without validating the checksum. + pub fn records_unchecked(&self, read_context: &dyn ReadContext) -> Result { let size = self.size_in_bytes()?; let count = self.record_count()?; + let schema_id = self.schema_id()?; + if count < 0 { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - format!("Invalid record count: {count}"), - )); + return Err(crate::error::Error::IoUnexpectedError { + message: format!("Invalid record count: {count}"), + source: io::Error::new(io::ErrorKind::InvalidData, "Invalid record count"), + }); } - Ok(KvRecordIterator { - data: self.data.clone(), - position: self.position + RECORDS_OFFSET, - end: self.position + size, - remaining_count: count, + + // Get row decoder for this schema from context (cached) + let row_decoder = read_context.get_row_decoder(schema_id)?; + + Ok(KvRecords { + iter: KvRecordIterator { + data: self.data.clone(), + position: self.position + RECORDS_OFFSET, + end: self.position + size, + remaining_count: count, + }, + row_decoder, }) } } +/// Iterable collection of KV records with associated decoder. 
+/// +/// This wrapper provides both iteration capability and access to the row decoder +/// needed to decode record values into typed rows. +pub struct KvRecords { + iter: KvRecordIterator, + row_decoder: Arc, +} + +impl KvRecords { + /// Get a reference to the row decoder for decoding record values. + /// + /// Returns a reference tied to the lifetime of `&self`. + /// Use this when iterating by reference. + pub fn decoder(&self) -> &dyn RowDecoder { + &*self.row_decoder + } + + /// Get an owned Arc to the row decoder. + /// + /// Returns a cloned Arc that can outlive the KvRecords, + /// allowing you to grab it before consuming the iterator. + /// Useful if you must keep the decoder beyond the iterable’s lifetime (collect-then-decode style). + pub fn decoder_arc(&self) -> Arc { + Arc::clone(&self.row_decoder) + } +} + +impl IntoIterator for KvRecords { + type Item = io::Result; + type IntoIter = KvRecordIterator; + + fn into_iter(self) -> Self::IntoIter { + self.iter + } +} + /// Iterator over records in a KV record batch. 
pub struct KvRecordIterator { data: Bytes, @@ -319,7 +371,9 @@ impl Iterator for KvRecordIterator { mod tests { use super::*; use crate::metadata::{DataTypes, KvFormat, RowType}; + use crate::record::kv::test_util::TestReadContext; use crate::record::kv::{CURRENT_KV_MAGIC_VALUE, KvRecordBatchBuilder}; + use crate::row::InternalRow; use crate::row::binary::BinaryWriter; use crate::row::compacted::CompactedRow; use bytes::{BufMut, BytesMut}; @@ -380,15 +434,24 @@ mod tests { assert_eq!(batch.batch_sequence().unwrap(), 5); assert_eq!(batch.record_count().unwrap(), 2); - let records: Vec<_> = batch.records().unwrap().collect(); - assert_eq!(records.len(), 2); + // Create ReadContext for reading + let read_context = TestReadContext::compacted(vec![DataTypes::bytes()]); - let record1 = records[0].as_ref().unwrap(); + // Iterate and verify records using typed API + let records = batch.records(&read_context).unwrap(); + let decoder = records.decoder_arc(); // Get Arc before consuming + + let mut iter = records.into_iter(); + let record1 = iter.next().unwrap().unwrap(); assert_eq!(record1.key().as_ref(), key1); - assert_eq!(record1.value().unwrap().as_ref(), value1_writer.buffer()); + assert!(!record1.is_deletion()); + let row1 = record1.row(&*decoder).unwrap(); + assert_eq!(row1.get_bytes(0), &[1, 2, 3, 4, 5]); - let record2 = records[1].as_ref().unwrap(); + let record2 = iter.next().unwrap().unwrap(); assert_eq!(record2.key().as_ref(), key2); - assert!(record2.value().is_none()); + assert!(record2.is_deletion()); + + assert!(iter.next().is_none()); } } diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs index c36a86121b..636104d176 100644 --- a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs @@ -330,22 +330,20 @@ mod tests { } #[test] - fn test_builder_basic_workflow() { + fn 
test_builder_basic_operations() { + // Test basic workflow: initial state, writer state, append, close, build let schema_id = 42; let write_limit = 4096; let mut builder = KvRecordBatchBuilder::new(schema_id, write_limit, KvFormat::COMPACTED); - // Test initial state assert!(!builder.is_closed()); assert_eq!(builder.writer_id(), NO_WRITER_ID); assert_eq!(builder.batch_sequence(), NO_BATCH_SEQUENCE); - // Test writer state builder.set_writer_state(100, 5); assert_eq!(builder.writer_id(), 100); assert_eq!(builder.batch_sequence(), 5); - // Test appending records let key1 = b"key1"; let value1 = create_test_row(b"value1"); assert!(builder.has_room_for_row(key1, Some(&value1))); @@ -355,7 +353,6 @@ mod tests { assert!(builder.has_room_for_row::(key2, None)); builder.append_row::(key2, None).unwrap(); - // Test close and build builder.close().unwrap(); assert!(builder.is_closed()); @@ -365,11 +362,8 @@ mod tests { // Building again should return cached result let bytes2 = builder.build().unwrap(); assert_eq!(bytes.len(), bytes2.len()); - } - #[test] - fn test_builder_lifecycle() { - // Test abort behavior + // Test lifecycle: abort behavior let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); let value = create_test_row(b"value"); builder.append_row(b"key", Some(&value)).unwrap(); @@ -378,13 +372,30 @@ mod tests { assert!(builder.build().is_err()); assert!(builder.close().is_err()); - // Test close behavior + // Test lifecycle: close behavior let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); let value = create_test_row(b"value"); builder.append_row(b"key", Some(&value)).unwrap(); builder.close().unwrap(); - assert!(builder.append_row::(b"key2", None).is_err()); // Can't append after close - assert!(builder.build().is_ok()); // But can still build + assert!(builder.append_row::(b"key2", None).is_err()); + assert!(builder.build().is_ok()); + + // Test KvFormat validation + let mut row_writer = CompactedRowWriter::new(1); + 
row_writer.write_int(42); + let row_type = RowType::with_data_types(vec![DataTypes::int()]); + let row = &CompactedRow::from_bytes(&row_type, row_writer.buffer()); + + // INDEXED format should reject append_row + let mut indexed_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::INDEXED); + let result = indexed_builder.append_row(b"key", Some(row)); + assert!(result.is_err()); + assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidInput); + + // COMPACTED format should accept append_row + let mut compacted_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); + let result = compacted_builder.append_row(b"key", Some(row)); + assert!(result.is_ok()); } #[test] @@ -430,7 +441,10 @@ mod tests { } #[test] - fn test_cache_invalidation_on_append() { + fn test_builder_cache_invalidation() { + use crate::record::kv::KvRecordBatch; + + // Test cache invalidation on append let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); builder.set_writer_state(100, 5); @@ -446,18 +460,13 @@ mod tests { let len2 = bytes2.len(); // Verify the second build includes both records - assert!(len2 > len1, "Second build should be larger"); - - use crate::record::kv::KvRecordBatch; + assert!(len2 > len1); let batch = KvRecordBatch::new(bytes2, 0); assert!(batch.is_valid()); - assert_eq!(batch.record_count().unwrap(), 2, "Should have 2 records"); - } + assert_eq!(batch.record_count().unwrap(), 2); - #[test] - fn test_cache_invalidation_on_set_writer_state() { + // Test cache invalidation on writer state change let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); - builder.set_writer_state(100, 5); let value = create_test_row(b"value"); builder.append_row(b"key", Some(&value)).unwrap(); @@ -467,24 +476,19 @@ mod tests { builder.set_writer_state(200, 10); let bytes2 = builder.build().unwrap(); - assert_ne!( - bytes1, bytes2, - "Bytes should differ after writer state change" - ); + assert_ne!(bytes1, bytes2); - use 
crate::record::kv::KvRecordBatch; let batch1 = KvRecordBatch::new(bytes1, 0); let batch2 = KvRecordBatch::new(bytes2, 0); assert_eq!(batch1.writer_id().unwrap(), 100); assert_eq!(batch1.batch_sequence().unwrap(), 5); - assert_eq!(batch2.writer_id().unwrap(), 200); assert_eq!(batch2.batch_sequence().unwrap(), 10); } #[test] - fn test_builder_with_compacted_row_writer() { + fn test_builder_with_compacted_row_writer() -> crate::error::Result<()> { use crate::record::kv::KvRecordBatch; use crate::row::InternalRow; use crate::row::compacted::CompactedRow; @@ -502,7 +506,7 @@ mod tests { let key1 = b"key1"; assert!(builder.has_room_for_row(key1, Some(row1))); - builder.append_row(key1, Some(row1)).unwrap(); + builder.append_row(key1, Some(row1))?; // Create and append second record let mut row_writer2 = CompactedRowWriter::new(2); @@ -512,63 +516,57 @@ mod tests { let row2 = &CompactedRow::from_bytes(&row_type, row_writer2.buffer()); let key2 = b"key2"; - builder.append_row(key2, Some(row2)).unwrap(); + builder.append_row(key2, Some(row2))?; // Append a deletion record let key3 = b"key3"; - builder.append_row::(key3, None).unwrap(); + builder.append_row::(key3, None)?; // Build and verify - builder.close().unwrap(); - let bytes = builder.build().unwrap(); + builder.close()?; + let bytes = builder.build()?; let batch = KvRecordBatch::new(bytes, 0); assert!(batch.is_valid()); - assert_eq!(batch.record_count().unwrap(), 3); - assert_eq!(batch.writer_id().unwrap(), 100); - assert_eq!(batch.batch_sequence().unwrap(), 5); - - // Read back and verify records - let records: Vec<_> = batch.records().unwrap().collect(); - assert_eq!(records.len(), 3); - - // Verify first record - let record1 = records[0].as_ref().unwrap(); - assert_eq!(record1.key().as_ref(), key1); - let row1 = CompactedRow::from_bytes(&row_type, record1.value().unwrap()); - assert_eq!(row1.get_int(0), 42); - assert_eq!(row1.get_string(1), "hello"); - - // Verify second record - let record2 = 
records[1].as_ref().unwrap(); - assert_eq!(record2.key().as_ref(), key2); - let row2 = CompactedRow::from_bytes(&row_type, record2.value().unwrap()); - assert_eq!(row2.get_int(0), 100); - assert_eq!(row2.get_string(1), "world"); - - // Verify deletion record - let record3 = records[2].as_ref().unwrap(); - assert_eq!(record3.key().as_ref(), key3); - assert!(record3.value().is_none()); - } - - #[test] - fn test_kv_format_validation() { - let mut row_writer = CompactedRowWriter::new(1); - row_writer.write_int(42); - - let row_type = RowType::with_data_types([DataTypes::int()].to_vec()); - let row = &CompactedRow::from_bytes(&row_type, row_writer.buffer()); - - // INDEXED format should reject append_row - let mut indexed_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::INDEXED); - let result = indexed_builder.append_row(b"key", Some(row)); - assert!(result.is_err()); - assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidInput); + assert_eq!(batch.record_count()?, 3); + assert_eq!(batch.writer_id()?, 100); + assert_eq!(batch.batch_sequence()?, 5); + + // Create ReadContext for reading typed rows + let types = vec![DataTypes::int(), DataTypes::string()]; + let read_context = crate::record::kv::test_util::TestReadContext::compacted(types); + + // Read back and verify records using idiomatic for-loop + let records = batch.records(&read_context)?; + let decoder = records.decoder_arc(); + let mut record_count = 0; + + for rec in records { + let rec = rec?; + record_count += 1; + + match record_count { + 1 => { + assert_eq!(rec.key().as_ref(), key1); + let row = rec.row(&*decoder).unwrap(); + assert_eq!(row.get_int(0), 42); + assert_eq!(row.get_string(1), "hello"); + } + 2 => { + assert_eq!(rec.key().as_ref(), key2); + let row = rec.row(&*decoder).unwrap(); + assert_eq!(row.get_int(0), 100); + assert_eq!(row.get_string(1), "world"); + } + 3 => { + assert_eq!(rec.key().as_ref(), key3); + assert!(rec.is_deletion()); + } + _ => panic!("Unexpected record count"), + 
} + } - // COMPACTED format should accept append_row - let mut compacted_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); - let result = compacted_builder.append_row(b"key", Some(row)); - assert!(result.is_ok()); + assert_eq!(record_count, 3); + Ok(()) } } diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs new file mode 100644 index 0000000000..2049c32680 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Default implementation of ReadContext with decoder caching. + +use super::ReadContext; +use crate::error::{Error, Result}; +use crate::metadata::{KvFormat, Schema}; +use crate::row::{RowDecoder, RowDecoderFactory}; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; + +/// Trait for fetching schemas by ID. +/// +/// This trait abstracts schema retrieval, allowing different implementations +/// (e.g., from metadata store, cache, or test mocks). +pub trait SchemaGetter: Send + Sync { + /// Get the schema for the given schema ID. 
+ /// + /// # Arguments + /// * `schema_id` - The schema ID to fetch + /// + /// # Returns + /// An Arc-wrapped Schema for the specified ID, or an error if the schema + /// cannot be fetched (missing ID, network error, etc.) + fn get_schema(&self, schema_id: i16) -> Result>; +} + +/// Default implementation of ReadContext with decoder caching. +/// +/// This implementation caches RowDecoders by schema ID for performance, +/// avoiding repeated schema lookups and decoder creation. +/// +/// Reference: org.apache.fluss.record.KvRecordReadContext +pub struct KvRecordReadContext { + kv_format: KvFormat, + schema_getter: Arc, + row_decoder_cache: Mutex>>, +} + +impl KvRecordReadContext { + /// Create a new KvRecordReadContext. + /// + /// # Arguments + /// * `kv_format` - The KV format (COMPACTED or INDEXED) + /// * `schema_getter` - The schema getter for fetching schemas by ID + /// + /// # Returns + /// A new KvRecordReadContext instance + pub fn new(kv_format: KvFormat, schema_getter: Arc) -> Self { + Self { + kv_format, + schema_getter, + row_decoder_cache: Mutex::new(HashMap::new()), + } + } +} + +impl ReadContext for KvRecordReadContext { + fn get_row_decoder(&self, schema_id: i16) -> Result> { + // First check: fast path + { + let cache = self + .row_decoder_cache + .lock() + .unwrap_or_else(|poisoned| poisoned.into_inner()); + if let Some(decoder) = cache.get(&schema_id) { + return Ok(Arc::clone(decoder)); + } + } // Release lock before expensive operations + + // Build decoder outside the lock to avoid blocking other threads + let schema = self.schema_getter.get_schema(schema_id)?; + let row_type = match schema.row_type() { + crate::metadata::DataType::Row(row_type) => row_type.clone(), + other => { + return Err(Error::IoUnexpectedError { + message: format!( + "Schema {} has invalid row type: expected Row, got {:?}", + schema_id, other + ), + source: std::io::Error::new( + std::io::ErrorKind::InvalidData, + "Invalid row type", + ), + }); + } + }; + + // Create 
decoder outside lock + let decoder = RowDecoderFactory::create(self.kv_format.clone(), row_type)?; + + // Second check: insert only if another thread didn't beat us to it + { + let mut cache = self + .row_decoder_cache + .lock() + .unwrap_or_else(|poisoned| poisoned.into_inner()); + // Check again - another thread might have inserted while we were building + if let Some(existing) = cache.get(&schema_id) { + return Ok(Arc::clone(existing)); + } + cache.insert(schema_id, Arc::clone(&decoder)); + } + + Ok(decoder) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{DataTypes, Schema}; + + struct MockSchemaGetter { + schema: Arc, + } + + impl MockSchemaGetter { + fn new(data_types: Vec) -> Self { + let mut builder = Schema::builder(); + for (i, dt) in data_types.iter().enumerate() { + builder = builder.column(&format!("field{}", i), dt.clone()); + } + let schema = builder.build().expect("Failed to build schema"); + + Self { + schema: Arc::new(schema), + } + } + } + + impl SchemaGetter for MockSchemaGetter { + fn get_schema(&self, _schema_id: i16) -> Result> { + Ok(Arc::clone(&self.schema)) + } + } + + #[test] + fn test_kv_record_read_context() { + // Test decoder caching for same schema ID + let schema_getter = Arc::new(MockSchemaGetter::new(vec![ + DataTypes::int(), + DataTypes::string(), + ])); + let read_context = KvRecordReadContext::new(KvFormat::COMPACTED, schema_getter); + + // Get decoder twice - should return the same instance (cached) + let decoder1 = read_context.get_row_decoder(42).unwrap(); + let decoder2 = read_context.get_row_decoder(42).unwrap(); + + // Verify same instance (Arc pointer equality) + assert!(Arc::ptr_eq(&decoder1, &decoder2)); + + // Test different schema IDs get different decoders + let schema_getter = Arc::new(MockSchemaGetter::new(vec![DataTypes::int()])); + let read_context = KvRecordReadContext::new(KvFormat::COMPACTED, schema_getter); + + let decoder1 = read_context.get_row_decoder(10).unwrap(); + let 
decoder2 = read_context.get_row_decoder(20).unwrap(); + + // Should be different instances + assert!(!Arc::ptr_eq(&decoder1, &decoder2)); + } +} diff --git a/fluss-rust/crates/fluss/src/record/kv/mod.rs b/fluss-rust/crates/fluss/src/record/kv/mod.rs index ecb762df16..857c5e5fc9 100644 --- a/fluss-rust/crates/fluss/src/record/kv/mod.rs +++ b/fluss-rust/crates/fluss/src/record/kv/mod.rs @@ -20,10 +20,17 @@ mod kv_record; mod kv_record_batch; mod kv_record_batch_builder; +mod kv_record_read_context; +mod read_context; + +#[cfg(test)] +mod test_util; pub use kv_record::{KvRecord, LENGTH_LENGTH as KV_RECORD_LENGTH_LENGTH}; pub use kv_record_batch::*; pub use kv_record_batch_builder::*; +pub use kv_record_read_context::{KvRecordReadContext, SchemaGetter}; +pub use read_context::ReadContext; /// Current KV magic value pub const CURRENT_KV_MAGIC_VALUE: u8 = 0; diff --git a/fluss-rust/crates/fluss/src/record/kv/read_context.rs b/fluss-rust/crates/fluss/src/record/kv/read_context.rs new file mode 100644 index 0000000000..63502613d1 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/read_context.rs @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Read context for KV record batches. +//! +//! 
Provides schema and decoder information needed for typed record reading. + +use crate::error::Result; +use crate::row::RowDecoder; +use std::sync::Arc; + +/// Context for reading KV records with type information. +/// +/// The ReadContext provides access to RowDecoders based on schema IDs, +/// enabling typed deserialization of KV record values. +/// +/// Reference: org.apache.fluss.record.KvRecordBatch.ReadContext +pub trait ReadContext: Send + Sync { + /// Get the row decoder for the given schema ID. + /// + /// The decoder is typically cached, so repeated calls with the same + /// schema ID should return the same decoder instance. + /// + /// # Arguments + /// * `schema_id` - The schema ID for which to get the decoder + /// + /// # Returns + /// An Arc-wrapped RowDecoder for the specified schema, or an error if + /// the schema is invalid or cannot be retrieved + fn get_row_decoder(&self, schema_id: i16) -> Result>; +} diff --git a/fluss-rust/crates/fluss/src/record/kv/test_util.rs b/fluss-rust/crates/fluss/src/record/kv/test_util.rs new file mode 100644 index 0000000000..50ab911db2 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/test_util.rs @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Test utilities for KV record reading. + +use super::ReadContext; +use crate::error::Result; +use crate::metadata::{DataType, KvFormat, RowType}; +use crate::row::{RowDecoder, RowDecoderFactory}; +use std::sync::Arc; + +/// Simple test-only ReadContext that creates decoders directly from data types. +/// +/// This bypasses the production Schema/SchemaGetter machinery for simpler tests. +pub(crate) struct TestReadContext { + kv_format: KvFormat, + data_types: Vec, +} + +impl TestReadContext { + /// Create a test context for COMPACTED format (most common case). + pub(crate) fn compacted(data_types: Vec) -> Self { + Self { + kv_format: KvFormat::COMPACTED, + data_types, + } + } +} + +impl ReadContext for TestReadContext { + fn get_row_decoder(&self, _schema_id: i16) -> Result> { + // Directly create decoder from data types - no Schema needed! + let row_type = RowType::with_data_types(self.data_types.clone()); + RowDecoderFactory::create(self.kv_format.clone(), row_type) + } +} diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index 3477f1de20..536409efde 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -23,11 +23,13 @@ pub mod binary; pub mod compacted; pub mod encode; mod field_getter; +mod row_decoder; pub use column::*; pub use compacted::CompactedRow; pub use datum::*; pub use encode::KeyEncoder; +pub use row_decoder::{CompactedRowDecoder, RowDecoder, RowDecoderFactory}; pub trait BinaryRow: InternalRow { /// Returns the binary representation of this row as a byte slice. 
diff --git a/fluss-rust/crates/fluss/src/row/row_decoder.rs b/fluss-rust/crates/fluss/src/row/row_decoder.rs new file mode 100644 index 0000000000..9f9b4217a2 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/row_decoder.rs @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Row decoder for deserializing binary row formats. +//! +//! Mirrors the Java org.apache.fluss.row.decode package. + +use crate::error::{Error, Result}; +use crate::metadata::{KvFormat, RowType}; +use crate::row::compacted::{CompactedRow, CompactedRowDeserializer}; +use std::sync::Arc; + +/// Decoder for creating BinaryRow from bytes. +/// +/// This trait provides an abstraction for decoding different row formats +/// (COMPACTED, INDEXED, etc.) from binary data. +/// +/// Reference: org.apache.fluss.row.decode.RowDecoder +pub trait RowDecoder: Send + Sync { + /// Decode bytes into a CompactedRow. + /// + /// The lifetime 'a ties the returned row to the input data, ensuring + /// the data remains valid as long as the row is used. + fn decode<'a>(&self, data: &'a [u8]) -> CompactedRow<'a>; +} + +/// Decoder for CompactedRow format. +/// +/// Uses the existing CompactedRow infrastructure for decoding. 
+/// This is a thin wrapper that implements the RowDecoder trait. +/// +/// Reference: org.apache.fluss.row.decode.CompactedRowDecoder +pub struct CompactedRowDecoder { + field_count: usize, + deserializer: Arc>, +} + +impl CompactedRowDecoder { + /// Create a new CompactedRowDecoder with the given row type. + pub fn new(row_type: RowType) -> Self { + let field_count = row_type.fields().len(); + let deserializer = Arc::new(CompactedRowDeserializer::new_from_owned(row_type)); + + Self { + field_count, + deserializer, + } + } +} + +impl RowDecoder for CompactedRowDecoder { + fn decode<'a>(&self, data: &'a [u8]) -> CompactedRow<'a> { + // Use existing CompactedRow::deserialize() infrastructure + CompactedRow::deserialize(Arc::clone(&self.deserializer), self.field_count, data) + } +} + +/// Factory for creating RowDecoders based on KvFormat. +/// +/// Reference: org.apache.fluss.row.decode.RowDecoder.create() +pub struct RowDecoderFactory; + +impl RowDecoderFactory { + /// Create a RowDecoder for the given format and row type. 
+ pub fn create(kv_format: KvFormat, row_type: RowType) -> Result> { + match kv_format { + KvFormat::COMPACTED => Ok(Arc::new(CompactedRowDecoder::new(row_type))), + KvFormat::INDEXED => Err(Error::UnsupportedOperation { + message: "INDEXED format is not yet supported".to_string(), + }), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::DataTypes; + use crate::row::InternalRow; + use crate::row::binary::BinaryWriter; + use crate::row::compacted::CompactedRowWriter; + + #[test] + fn test_compacted_row_decoder() { + // Write a CompactedRow + let mut writer = CompactedRowWriter::new(2); + writer.write_int(42); + writer.write_string("hello"); + + let data = writer.to_bytes(); + + // Create decoder with RowType + let row_type = RowType::with_data_types(vec![DataTypes::int(), DataTypes::string()]); + let decoder = CompactedRowDecoder::new(row_type); + + // Decode + let row = decoder.decode(&data); + + // Verify + assert_eq!(row.get_field_count(), 2); + assert_eq!(row.get_int(0), 42); + assert_eq!(row.get_string(1), "hello"); + } + + #[test] + fn test_row_decoder_factory() { + let row_type = RowType::with_data_types(vec![DataTypes::int(), DataTypes::string()]); + let decoder = RowDecoderFactory::create(KvFormat::COMPACTED, row_type).unwrap(); + + // Write a row + let mut writer = CompactedRowWriter::new(2); + writer.write_int(100); + writer.write_string("world"); + let data = writer.to_bytes(); + + // Decode + let row = decoder.decode(&data); + + // Verify + assert_eq!(row.get_int(0), 100); + assert_eq!(row.get_string(1), "world"); + } +} From f519ad9f840fbb3cf7c86458b20803c72f2dc7c5 Mon Sep 17 00:00:00 2001 From: yuxia Luo Date: Sun, 18 Jan 2026 22:09:25 +0800 Subject: [PATCH 074/287] feat: introduce KvWriteBatch and PutKvRequest (#176) --- fluss-rust/bindings/python/src/table.rs | 11 +- fluss-rust/crates/fluss/build.rs | 5 +- .../crates/fluss/src/client/table/append.rs | 11 +- .../src/client/table/log_fetch_buffer.rs | 2 +- 
.../crates/fluss/src/client/table/scanner.rs | 3 +- .../crates/fluss/src/client/table/writer.rs | 5 +- .../fluss/src/client/write/accumulator.rs | 7 +- .../crates/fluss/src/client/write/batch.rs | 133 +++++++++++++++--- .../crates/fluss/src/client/write/mod.rs | 79 +++++++++-- .../crates/fluss/src/client/write/sender.rs | 15 +- .../crates/fluss/src/proto/fluss_api.proto | 28 ++++ fluss-rust/crates/fluss/src/record/arrow.rs | 17 ++- .../src/record/kv/kv_record_batch_builder.rs | 25 ++-- .../src/record/kv/kv_record_read_context.rs | 5 +- fluss-rust/crates/fluss/src/rpc/api_key.rs | 4 + .../crates/fluss/src/rpc/message/mod.rs | 1 + .../fluss/src/rpc/message/produce_log.rs | 2 +- .../crates/fluss/src/rpc/message/put_kv.rs | 73 ++++++++++ 18 files changed, 360 insertions(+), 66 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/rpc/message/put_kv.rs diff --git a/fluss-rust/bindings/python/src/table.rs b/fluss-rust/bindings/python/src/table.rs index db85c51f5a..773354e8d8 100644 --- a/fluss-rust/bindings/python/src/table.rs +++ b/fluss-rust/bindings/python/src/table.rs @@ -340,8 +340,7 @@ fn python_to_generic_row( .map(|n| n.to_string()) .unwrap_or_else(|_| "unknown".to_string()); FlussError::new_err(format!( - "Row must be a dict, list, or tuple; got {}", - type_name + "Row must be a dict, list, or tuple; got {type_name}" )) })?; let schema = table_info.row_type(); @@ -357,7 +356,7 @@ fn python_to_generic_row( .name() .map(|n| n.to_string()) .unwrap_or_else(|_| "unknown".to_string()); - FlussError::new_err(format!("Row dict keys must be strings; got {}", key_type)) + FlussError::new_err(format!("Row dict keys must be strings; got {key_type}")) })?; if fields.iter().all(|f| f.name() != key_str) { @@ -367,8 +366,7 @@ fn python_to_generic_row( .collect::>() .join(", "); return Err(FlussError::new_err(format!( - "Unknown field '{}'. Expected fields: {}", - key_str, expected + "Unknown field '{key_str}'. 
Expected fields: {expected}" ))); } } @@ -476,8 +474,7 @@ fn python_value_to_datum( } } _ => Err(FlussError::new_err(format!( - "Unsupported data type for row-level operations: {:?}", - data_type + "Unsupported data type for row-level operations: {data_type}" ))), } } diff --git a/fluss-rust/crates/fluss/build.rs b/fluss-rust/crates/fluss/build.rs index 1564313732..265208a7c3 100644 --- a/fluss-rust/crates/fluss/build.rs +++ b/fluss-rust/crates/fluss/build.rs @@ -19,7 +19,10 @@ use std::io::Result; fn main() -> Result<()> { let mut config = prost_build::Config::new(); - config.bytes([".proto.PbProduceLogReqForBucket.records"]); + config.bytes([ + ".proto.PbProduceLogReqForBucket.records", + ".proto.PbPutKvReqForBucket.records", + ]); config.compile_protos(&["src/proto/fluss_api.proto"], &["src/proto"])?; Ok(()) } diff --git a/fluss-rust/crates/fluss/src/client/table/append.rs b/fluss-rust/crates/fluss/src/client/table/append.rs index ad3e55e288..6d76f28b32 100644 --- a/fluss-rust/crates/fluss/src/client/table/append.rs +++ b/fluss-rust/crates/fluss/src/client/table/append.rs @@ -46,6 +46,7 @@ impl TableAppend { AppendWriter { table_path: Arc::new(self.table_path.clone()), writer_client: self.writer_client.clone(), + table_info: Arc::new(self.table_info.clone()), } } } @@ -53,18 +54,24 @@ impl TableAppend { pub struct AppendWriter { table_path: Arc, writer_client: Arc, + table_info: Arc, } impl AppendWriter { pub async fn append(&self, row: GenericRow<'_>) -> Result<()> { - let record = WriteRecord::new(self.table_path.clone(), row); + let record = + WriteRecord::for_append(self.table_path.clone(), self.table_info.schema_id, row); let result_handle = self.writer_client.send(&record).await?; let result = result_handle.wait().await?; result_handle.result(result) } pub async fn append_arrow_batch(&self, batch: RecordBatch) -> Result<()> { - let record = WriteRecord::new_record_batch(self.table_path.clone(), batch); + let record = WriteRecord::for_append_record_batch( + 
self.table_path.clone(), + self.table_info.schema_id, + batch, + ); let result_handle = self.writer_client.send(&record).await?; let result = result_handle.wait().await?; result_handle.result(result) diff --git a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs index fb6981f4b5..ac44cc1728 100644 --- a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs +++ b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs @@ -733,7 +733,7 @@ mod tests { let mut row = GenericRow::new(); row.set_field(0, 1_i32); row.set_field(1, "alice"); - let record = WriteRecord::new(table_path, row); + let record = WriteRecord::for_append(table_path, 1, row); builder.append(&record)?; let data = builder.build()?; diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index 3e7d61ff99..e9b2ce106d 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -1446,8 +1446,9 @@ mod tests { compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, }, ); - let record = WriteRecord::new( + let record = WriteRecord::for_append( table_path, + 1, GenericRow { values: vec![Datum::Int32(1)], }, diff --git a/fluss-rust/crates/fluss/src/client/table/writer.rs b/fluss-rust/crates/fluss/src/client/table/writer.rs index b2ba881b36..8a83b5e356 100644 --- a/fluss-rust/crates/fluss/src/client/table/writer.rs +++ b/fluss-rust/crates/fluss/src/client/table/writer.rs @@ -43,6 +43,7 @@ pub struct AbstractTableWriter { table_path: Arc, writer_client: Arc, field_count: i32, + schema_id: i32, } #[allow(dead_code)] @@ -57,6 +58,7 @@ impl AbstractTableWriter { table_path: Arc::new(table_path), writer_client, field_count: table_info.row_type().fields().len() as i32, + schema_id: table_info.schema_id, } } @@ -82,7 +84,8 @@ pub struct AppendWriterImpl { #[allow(dead_code)] impl AppendWriterImpl { pub async 
fn append(&self, row: GenericRow<'_>) -> Result<()> { - let record = WriteRecord::new(self.base.table_path.clone(), row); + let record = + WriteRecord::for_append(self.base.table_path.clone(), self.base.schema_id, row); self.base.send(&record).await } } diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs index 83f11ab782..0afc9d4bc1 100644 --- a/fluss-rust/crates/fluss/src/client/write/accumulator.rs +++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs @@ -17,7 +17,7 @@ use crate::client::write::batch::WriteBatch::ArrowLog; use crate::client::write::batch::{ArrowLogWriteBatch, WriteBatch}; -use crate::client::{Record, ResultHandle, WriteRecord}; +use crate::client::{LogWriteRecord, Record, ResultHandle, WriteRecord}; use crate::cluster::{BucketLocation, Cluster, ServerNode}; use crate::config::Config; use crate::error::Result; @@ -110,7 +110,7 @@ impl RecordAccumulator { row_type, bucket_id, current_time_ms(), - matches!(record.row, Record::RecordBatch(_)), + matches!(&record.record, Record::Log(LogWriteRecord::RecordBatch(_))), )); let batch_id = batch.batch_id(); @@ -541,8 +541,9 @@ mod tests { let accumulator = RecordAccumulator::new(config); let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); let cluster = Arc::new(build_cluster(table_path.as_ref(), 1, 1)); - let record = WriteRecord::new( + let record = WriteRecord::for_append( table_path.clone(), + 1, GenericRow { values: vec![Datum::Int32(1)], }, diff --git a/fluss-rust/crates/fluss/src/client/write/batch.rs b/fluss-rust/crates/fluss/src/client/write/batch.rs index 1f54226f39..01597538c8 100644 --- a/fluss-rust/crates/fluss/src/client/write/batch.rs +++ b/fluss-rust/crates/fluss/src/client/write/batch.rs @@ -17,13 +17,13 @@ use crate::BucketId; use crate::client::broadcast::{BatchWriteResult, BroadcastOnce}; -use crate::client::{ResultHandle, WriteRecord}; +use crate::client::{Record, ResultHandle, 
WriteRecord}; use crate::compression::ArrowCompressionInfo; -use crate::error::Result; -use crate::metadata::{DataType, TablePath}; +use crate::error::{Error, Result}; +use crate::metadata::{DataType, KvFormat, TablePath}; use crate::record::MemoryLogRecordsArrowBuilder; +use crate::record::kv::KvRecordBatchBuilder; use bytes::Bytes; -use parking_lot::Mutex; use std::cmp::max; use std::sync::atomic::{AtomicBool, AtomicI32, Ordering}; @@ -92,18 +92,28 @@ impl InnerWriteBatch { pub enum WriteBatch { ArrowLog(ArrowLogWriteBatch), + Kv(KvWriteBatch), } impl WriteBatch { pub fn inner_batch(&self) -> &InnerWriteBatch { match self { WriteBatch::ArrowLog(batch) => &batch.write_batch, + WriteBatch::Kv(batch) => &batch.write_batch, + } + } + + pub fn inner_batch_mut(&mut self) -> &mut InnerWriteBatch { + match self { + WriteBatch::ArrowLog(batch) => &mut batch.write_batch, + WriteBatch::Kv(batch) => &mut batch.write_batch, } } pub fn try_append(&mut self, write_record: &WriteRecord) -> Result> { match self { WriteBatch::ArrowLog(batch) => batch.try_append(write_record), + WriteBatch::Kv(batch) => batch.try_append(write_record), } } @@ -111,11 +121,13 @@ impl WriteBatch { self.inner_batch().waited_time_ms(now) } - pub fn close(&mut self) { + pub fn close(&mut self) -> Result<()> { match self { WriteBatch::ArrowLog(batch) => { batch.close(); + Ok(()) } + WriteBatch::Kv(batch) => batch.close(), } } @@ -127,20 +139,18 @@ impl WriteBatch { pub fn is_closed(&self) -> bool { match self { WriteBatch::ArrowLog(batch) => batch.is_closed(), + WriteBatch::Kv(batch) => batch.is_closed(), } } pub fn drained(&mut self, now_ms: i64) { - match self { - WriteBatch::ArrowLog(batch) => { - batch.write_batch.drained(now_ms); - } - } + self.inner_batch_mut().drained(now_ms); } - pub fn build(&self) -> Result { + pub fn build(&mut self) -> Result { match self { WriteBatch::ArrowLog(batch) => batch.build(), + WriteBatch::Kv(batch) => batch.build(), } } @@ -172,7 +182,7 @@ impl WriteBatch { pub 
struct ArrowLogWriteBatch { pub write_batch: InnerWriteBatch, pub arrow_builder: MemoryLogRecordsArrowBuilder, - built_records: Mutex>, + built_records: Option, } impl ArrowLogWriteBatch { @@ -196,7 +206,7 @@ impl ArrowLogWriteBatch { to_append_record_batch, arrow_compression_info, ), - built_records: Mutex::new(None), + built_records: None, } } @@ -218,13 +228,12 @@ impl ArrowLogWriteBatch { } } - pub fn build(&self) -> Result { - let mut cached = self.built_records.lock(); - if let Some(bytes) = cached.as_ref() { + pub fn build(&mut self) -> Result { + if let Some(bytes) = &self.built_records { return Ok(bytes.clone()); } let bytes = Bytes::from(self.arrow_builder.build()?); - *cached = Some(bytes.clone()); + self.built_records = Some(bytes.clone()); Ok(bytes) } @@ -237,6 +246,96 @@ impl ArrowLogWriteBatch { } } +pub struct KvWriteBatch { + write_batch: InnerWriteBatch, + kv_batch_builder: KvRecordBatchBuilder, + target_columns: Option>, + schema_id: i32, +} + +impl KvWriteBatch { + #[allow(clippy::too_many_arguments)] + pub fn new( + batch_id: i64, + table_path: TablePath, + schema_id: i32, + write_limit: usize, + kv_format: KvFormat, + bucket_id: BucketId, + target_columns: Option>, + create_ms: i64, + ) -> Self { + let base = InnerWriteBatch::new(batch_id, table_path, create_ms, bucket_id); + Self { + write_batch: base, + kv_batch_builder: KvRecordBatchBuilder::new(schema_id, write_limit, kv_format), + target_columns, + schema_id, + } + } + + pub fn try_append(&mut self, write_record: &WriteRecord) -> Result> { + let kv_write_record = match &write_record.record { + Record::Kv(record) => record, + _ => { + return Err(Error::UnsupportedOperation { + message: "Only KvRecord to append to KvWriteBatch ".to_string(), + }); + } + }; + + let key = kv_write_record.key; + + if self.schema_id != write_record.schema_id { + return Err(Error::UnexpectedError { + message: format!( + "schema id {} of the write record to append is not the same as the current schema id {} in 
the batch.", + write_record.schema_id, self.schema_id + ), + source: None, + }); + }; + + if self.target_columns.as_deref() != kv_write_record.target_columns { + return Err(Error::UnexpectedError { + message: format!( + "target columns {:?} of the write record to append are not the same as the current target columns {:?} in the batch.", + kv_write_record.target_columns, + self.target_columns.as_deref() + ), + source: None, + }); + } + + let row = kv_write_record.compacted_row.as_ref(); + + if self.is_closed() || !self.kv_batch_builder.has_room_for_row(key, row) { + Ok(None) + } else { + // append successfully + self.kv_batch_builder + .append_row(key, row) + .map_err(|e| Error::UnexpectedError { + message: "Failed to append row to KvWriteBatch".to_string(), + source: Some(Box::new(e)), + })?; + Ok(Some(ResultHandle::new(self.write_batch.results.receiver()))) + } + } + + pub fn build(&mut self) -> Result { + self.kv_batch_builder.build() + } + + pub fn is_closed(&self) -> bool { + self.kv_batch_builder.is_closed() + } + + pub fn close(&mut self) -> Result<()> { + self.kv_batch_builder.close() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/fluss-rust/crates/fluss/src/client/write/mod.rs b/fluss-rust/crates/fluss/src/client/write/mod.rs index 00a71c51fa..248218e076 100644 --- a/fluss-rust/crates/fluss/src/client/write/mod.rs +++ b/fluss-rust/crates/fluss/src/client/write/mod.rs @@ -21,7 +21,7 @@ mod batch; use crate::client::broadcast::{self as client_broadcast, BatchWriteResult, BroadcastOnceReceiver}; use crate::error::Error; use crate::metadata::TablePath; -use crate::row::GenericRow; +use crate::row::{CompactedRow, GenericRow}; pub use accumulator::*; use arrow::array::RecordBatch; use std::sync::Arc; @@ -36,28 +36,91 @@ mod writer_client; pub use write_format::WriteFormat; pub use writer_client::WriterClient; +#[allow(dead_code)] pub struct WriteRecord<'a> { - pub row: Record<'a>, - pub table_path: Arc, + record: Record<'a>, + table_path: Arc, + 
bucket_key: Option<&'a [u8]>, + schema_id: i32, + write_format: WriteFormat, +} + +impl<'a> WriteRecord<'a> { + pub fn record(&self) -> &Record<'a> { + &self.record + } } pub enum Record<'a> { - Row(GenericRow<'a>), + Log(LogWriteRecord<'a>), + Kv(KvWriteRecord<'a>), +} + +pub enum LogWriteRecord<'a> { + Generic(GenericRow<'a>), RecordBatch(Arc), } +pub struct KvWriteRecord<'a> { + // only valid for primary key table + key: &'a [u8], + target_columns: Option<&'a [usize]>, + compacted_row: Option>, +} + +impl<'a> KvWriteRecord<'a> { + fn new( + key: &'a [u8], + target_columns: Option<&'a [usize]>, + compacted_row: Option>, + ) -> Self { + KvWriteRecord { + key, + target_columns, + compacted_row, + } + } +} + impl<'a> WriteRecord<'a> { - pub fn new(table_path: Arc, row: GenericRow<'a>) -> Self { + pub fn for_append(table_path: Arc, schema_id: i32, row: GenericRow<'a>) -> Self { + Self { + record: Record::Log(LogWriteRecord::Generic(row)), + table_path, + bucket_key: None, + schema_id, + write_format: WriteFormat::ArrowLog, + } + } + + pub fn for_append_record_batch( + table_path: Arc, + schema_id: i32, + row: RecordBatch, + ) -> Self { Self { - row: Record::Row(row), + record: Record::Log(LogWriteRecord::RecordBatch(Arc::new(row))), table_path, + bucket_key: None, + schema_id, + write_format: WriteFormat::ArrowLog, } } - pub fn new_record_batch(table_path: Arc, row: RecordBatch) -> Self { + pub fn for_upsert( + table_path: Arc, + schema_id: i32, + bucket_key: &'a [u8], + key: &'a [u8], + target_columns: Option<&'a [usize]>, + row: CompactedRow<'a>, + ) -> Self { Self { - row: Record::RecordBatch(Arc::new(row)), + record: Record::Kv(KvWriteRecord::new(key, target_columns, Some(row))), table_path, + bucket_key: Some(bucket_key), + schema_id, + write_format: WriteFormat::CompactedKv, } } } diff --git a/fluss-rust/crates/fluss/src/client/write/sender.rs b/fluss-rust/crates/fluss/src/client/write/sender.rs index ffac0af8ae..7ea24e30f6 100644 --- 
a/fluss-rust/crates/fluss/src/client/write/sender.rs +++ b/fluss-rust/crates/fluss/src/client/write/sender.rs @@ -178,9 +178,9 @@ impl Sender { }; for (table_id, table_buckets) in write_batch_by_table { - let request_batches: Vec<&ReadyWriteBatch> = table_buckets + let mut request_batches: Vec = table_buckets .iter() - .filter_map(|bucket| records_by_bucket.get(bucket)) + .filter_map(|bucket| records_by_bucket.remove(bucket)) .collect(); if request_batches.is_empty() { continue; @@ -189,7 +189,7 @@ impl Sender { table_id, acks, self.max_request_timeout_ms, - request_batches.as_slice(), + &mut request_batches, ) { Ok(request) => request, Err(e) => { @@ -205,6 +205,12 @@ impl Sender { } }; + // let's put in back into records_by_bucket + // since response handle will use it. + for request_batch in request_batches { + records_by_bucket.insert(request_batch.table_bucket.clone(), request_batch); + } + let response = match connection.request(request).await { Ok(response) => response, Err(e) => { @@ -462,8 +468,9 @@ mod tests { cluster: Arc, table_path: Arc, ) -> Result<(ReadyWriteBatch, crate::client::ResultHandle)> { - let record = WriteRecord::new( + let record = WriteRecord::for_append( table_path, + 1, GenericRow { values: vec![Datum::Int32(1)], }, diff --git a/fluss-rust/crates/fluss/src/proto/fluss_api.proto b/fluss-rust/crates/fluss/src/proto/fluss_api.proto index b4ae8405aa..eaee94c36c 100644 --- a/fluss-rust/crates/fluss/src/proto/fluss_api.proto +++ b/fluss-rust/crates/fluss/src/proto/fluss_api.proto @@ -119,6 +119,34 @@ message PbProduceLogRespForBucket { optional int64 base_offset = 5; } +// put kv request and response +message PutKvRequest { + required int32 acks = 1; + required int64 table_id = 2; + required int32 timeout_ms = 3; + // the indexes for the columns to write, + // if empty, means write all columns + repeated int32 target_columns = 4 [packed = true]; + repeated PbPutKvReqForBucket buckets_req = 5; +} + +message PutKvResponse { + repeated 
PbPutKvRespForBucket buckets_resp = 1; +} + +message PbPutKvReqForBucket { + optional int64 partition_id = 1; + required int32 bucket_id = 2; + required bytes records = 3; +} + +message PbPutKvRespForBucket { + optional int64 partition_id = 1; + required int32 bucket_id = 2; + optional int32 error_code = 3; + optional string error_message = 4; +} + message CreateTableRequest { required PbTablePath table_path = 1; required bytes table_json = 2; diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index b331ae9d7f..aa48376bb6 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::client::{Record, WriteRecord}; +use crate::client::{LogWriteRecord, Record, WriteRecord}; use crate::compression::ArrowCompressionInfo; use crate::error::{Error, Result}; use crate::metadata::DataType; @@ -275,11 +275,16 @@ impl MemoryLogRecordsArrowBuilder { } pub fn append(&mut self, record: &WriteRecord) -> Result { - match &record.row { - Record::Row(row) => Ok(self.arrow_record_batch_builder.append(row)?), - Record::RecordBatch(record_batch) => Ok(self - .arrow_record_batch_builder - .append_batch(record_batch.clone())?), + match &record.record() { + Record::Log(log_write_record) => match log_write_record { + LogWriteRecord::Generic(row) => Ok(self.arrow_record_batch_builder.append(row)?), + LogWriteRecord::RecordBatch(record_batch) => Ok(self + .arrow_record_batch_builder + .append_batch(record_batch.clone())?), + }, + Record::Kv(_) => Err(Error::UnsupportedOperation { + message: "Only LogRecord is supported to append".to_string(), + }), } // todo: consider write other change type } diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs index 636104d176..e3da8640f7 100644 --- 
a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs @@ -19,9 +19,7 @@ //! //! This module provides the KvRecordBatchBuilder for building batches of KV records. -use bytes::{Bytes, BytesMut}; -use std::io; - +use crate::error::{Error, Result}; use crate::metadata::KvFormat; use crate::record::kv::kv_record::KvRecord; use crate::record::kv::kv_record_batch::{ @@ -31,6 +29,8 @@ use crate::record::kv::kv_record_batch::{ }; use crate::record::kv::{CURRENT_KV_MAGIC_VALUE, NO_BATCH_SEQUENCE, NO_WRITER_ID}; use crate::row::BinaryRow; +use bytes::{Bytes, BytesMut}; +use std::io; /// Builder for KvRecordBatch. /// @@ -185,11 +185,12 @@ impl KvRecordBatchBuilder { /// built bytes may change if mutations occur between builds. /// /// Note: [`close`](Self::close) prevents further appends but does not prevent writer state modifications. - pub fn build(&mut self) -> io::Result { + pub fn build(&mut self) -> Result { if self.aborted { - return Err(io::Error::other( - "Attempting to build an aborted record batch", - )); + return Err(Error::UnexpectedError { + message: "Attempting to build an aborted record batch".to_string(), + source: None, + }); } if let Some(ref cached) = self.built_buffer { @@ -225,11 +226,13 @@ impl KvRecordBatchBuilder { /// Close the builder. /// After closing, no more records can be appended, but the batch can still be built. 
- pub fn close(&mut self) -> io::Result<()> { + pub fn close(&mut self) -> Result<()> { if self.aborted { - return Err(io::Error::other( - "Cannot close KvRecordBatchBuilder as it has already been aborted", - )); + return Err(Error::UnexpectedError { + message: "Cannot close KvRecordBatchBuilder as it has already been aborted" + .to_string(), + source: None, + }); } self.is_closed = true; Ok(()) diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs index 2049c32680..fe6c6f0598 100644 --- a/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs @@ -90,8 +90,7 @@ impl ReadContext for KvRecordReadContext { other => { return Err(Error::IoUnexpectedError { message: format!( - "Schema {} has invalid row type: expected Row, got {:?}", - schema_id, other + "Schema {schema_id} has invalid row type: expected Row, got {other:?}" ), source: std::io::Error::new( std::io::ErrorKind::InvalidData, @@ -134,7 +133,7 @@ mod tests { fn new(data_types: Vec) -> Self { let mut builder = Schema::builder(); for (i, dt) in data_types.iter().enumerate() { - builder = builder.column(&format!("field{}", i), dt.clone()); + builder = builder.column(&format!("field{i}"), dt.clone()); } let schema = builder.build().expect("Failed to build schema"); diff --git a/fluss-rust/crates/fluss/src/rpc/api_key.rs b/fluss-rust/crates/fluss/src/rpc/api_key.rs index 9f9268e857..66e4beb8e4 100644 --- a/fluss-rust/crates/fluss/src/rpc/api_key.rs +++ b/fluss-rust/crates/fluss/src/rpc/api_key.rs @@ -30,6 +30,7 @@ pub enum ApiKey { TableExists, MetaData, ProduceLog, + PutKv, FetchLog, Lookup, ListOffsets, @@ -54,6 +55,7 @@ impl From for ApiKey { 1012 => ApiKey::MetaData, 1014 => ApiKey::ProduceLog, 1015 => ApiKey::FetchLog, + 1016 => ApiKey::PutKv, 1017 => ApiKey::Lookup, 1021 => ApiKey::ListOffsets, 1025 => ApiKey::GetFileSystemSecurityToken, @@ 
-79,6 +81,7 @@ impl From for i16 { ApiKey::MetaData => 1012, ApiKey::ProduceLog => 1014, ApiKey::FetchLog => 1015, + ApiKey::PutKv => 1016, ApiKey::Lookup => 1017, ApiKey::ListOffsets => 1021, ApiKey::GetFileSystemSecurityToken => 1025, @@ -108,6 +111,7 @@ mod tests { (1012, ApiKey::MetaData), (1014, ApiKey::ProduceLog), (1015, ApiKey::FetchLog), + (1016, ApiKey::PutKv), (1017, ApiKey::Lookup), (1021, ApiKey::ListOffsets), (1025, ApiKey::GetFileSystemSecurityToken), diff --git a/fluss-rust/crates/fluss/src/rpc/message/mod.rs b/fluss-rust/crates/fluss/src/rpc/message/mod.rs index 2fe506bc37..4e6c8e1eaf 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/mod.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/mod.rs @@ -36,6 +36,7 @@ mod list_offsets; mod list_tables; mod lookup; mod produce_log; +mod put_kv; mod table_exists; mod update_metadata; diff --git a/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs b/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs index eb725751c7..dab7ea9a57 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs @@ -37,7 +37,7 @@ impl ProduceLogRequest { table_id: i64, ack: i16, max_request_timeout_ms: i32, - ready_batches: &[&ReadyWriteBatch], + ready_batches: &mut [ReadyWriteBatch], ) -> FlussResult { let mut request = proto::ProduceLogRequest { table_id, diff --git a/fluss-rust/crates/fluss/src/rpc/message/put_kv.rs b/fluss-rust/crates/fluss/src/rpc/message/put_kv.rs new file mode 100644 index 0000000000..983faa6646 --- /dev/null +++ b/fluss-rust/crates/fluss/src/rpc/message/put_kv.rs @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +use crate::client::ReadyWriteBatch; +use crate::proto::{PbPutKvReqForBucket, PutKvResponse}; +use crate::rpc::api_key::ApiKey; +use crate::rpc::api_version::ApiVersion; +use crate::rpc::frame::ReadError; +use crate::rpc::frame::WriteError; +use crate::rpc::message::{ReadVersionedType, RequestBody, WriteVersionedType}; +use crate::{impl_read_version_type, impl_write_version_type, proto}; +use bytes::{Buf, BufMut}; +use prost::Message; + +#[allow(dead_code)] +pub struct PutKvRequest { + pub inner_request: proto::PutKvRequest, +} + +#[allow(dead_code)] +impl PutKvRequest { + pub fn new( + table_id: i64, + ack: i16, + max_request_timeout_ms: i32, + target_columns: Vec, + ready_batches: &mut [ReadyWriteBatch], + ) -> crate::error::Result { + let mut request = proto::PutKvRequest { + table_id, + acks: ack as i32, + timeout_ms: max_request_timeout_ms, + target_columns, + ..Default::default() + }; + for ready_batch in ready_batches { + request.buckets_req.push(PbPutKvReqForBucket { + partition_id: ready_batch.table_bucket.partition_id(), + bucket_id: ready_batch.table_bucket.bucket_id(), + records: ready_batch.write_batch.build()?, + }) + } + + Ok(PutKvRequest { + inner_request: request, + }) + } +} + +impl RequestBody for PutKvRequest { + type ResponseBody = PutKvResponse; + + const API_KEY: ApiKey = ApiKey::PutKv; + + const REQUEST_VERSION: ApiVersion = ApiVersion(0); +} + 
+impl_write_version_type!(PutKvRequest); +impl_read_version_type!(PutKvResponse); From db565f97e671cb81e7bd40e1eda64f6d527be440 Mon Sep 17 00:00:00 2001 From: Anton Borisov <148864665+fresh-borzoni@users.noreply.github.com> Date: Mon, 19 Jan 2026 13:27:22 +0000 Subject: [PATCH 075/287] feat: support all basic datatypes in compacted key encoder(continuation) (#175) --- fluss-rust/crates/fluss/Cargo.toml | 2 +- .../crates/fluss/src/metadata/datatype.rs | 272 +++++++--- .../crates/fluss/src/metadata/json_serde.rs | 112 +++- fluss-rust/crates/fluss/src/record/arrow.rs | 21 +- .../fluss/src/row/binary/binary_writer.rs | 57 ++- fluss-rust/crates/fluss/src/row/column.rs | 198 +++++++- .../src/row/compacted/compacted_key_writer.rs | 7 + .../fluss/src/row/compacted/compacted_row.rs | 147 ++++-- .../src/row/compacted/compacted_row_reader.rs | 72 ++- .../src/row/compacted/compacted_row_writer.rs | 125 ++++- fluss-rust/crates/fluss/src/row/datum.rs | 181 ++++++- fluss-rust/crates/fluss/src/row/decimal.rs | 477 ++++++++++++++++++ .../src/row/encode/compacted_key_encoder.rs | 175 ++++--- .../crates/fluss/src/row/field_getter.rs | 101 +++- fluss-rust/crates/fluss/src/row/mod.rs | 63 ++- 15 files changed, 1739 insertions(+), 271 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/row/decimal.rs diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index 8942ffc7db..c3bdd4475a 100644 --- a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -48,7 +48,7 @@ tokio = { workspace = true } parking_lot = "0.12" bytes = "1.10.1" dashmap = "6.1.0" -rust_decimal = "1" +bigdecimal = { version = "0.4", features = ["serde"] } ordered-float = { version = "5", features = ["serde"] } parse-display = "0.10" ref-cast = "1.0" diff --git a/fluss-rust/crates/fluss/src/metadata/datatype.rs b/fluss-rust/crates/fluss/src/metadata/datatype.rs index f1574665eb..e365237030 100644 --- a/fluss-rust/crates/fluss/src/metadata/datatype.rs +++ 
b/fluss-rust/crates/fluss/src/metadata/datatype.rs @@ -453,16 +453,40 @@ impl DecimalType { pub const DEFAULT_SCALE: u32 = 0; - pub fn new(precision: u32, scale: u32) -> Self { + pub fn new(precision: u32, scale: u32) -> Result { Self::with_nullable(true, precision, scale) } - pub fn with_nullable(nullable: bool, precision: u32, scale: u32) -> Self { - DecimalType { + /// Create a DecimalType with validation, returning an error if parameters are invalid. + pub fn with_nullable(nullable: bool, precision: u32, scale: u32) -> Result { + // Validate precision + if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) { + return Err(IllegalArgument { + message: format!( + "Decimal precision must be between {} and {} (both inclusive), got: {}", + Self::MIN_PRECISION, + Self::MAX_PRECISION, + precision + ), + }); + } + // Validate scale + // Note: MIN_SCALE is 0, and scale is u32, so scale >= MIN_SCALE is always true + if scale > precision { + return Err(IllegalArgument { + message: format!( + "Decimal scale must be between {} and the precision {} (both inclusive), got: {}", + Self::MIN_SCALE, + precision, + scale + ), + }); + } + Ok(DecimalType { nullable, precision, scale, - } + }) } pub fn precision(&self) -> u32 { @@ -475,6 +499,7 @@ impl DecimalType { pub fn as_non_nullable(&self) -> Self { Self::with_nullable(false, self.precision, self.scale) + .expect("Invalid decimal precision or scale") } } @@ -531,7 +556,7 @@ pub struct TimeType { impl TimeType { fn default() -> Self { - Self::new(Self::DEFAULT_PRECISION) + Self::new(Self::DEFAULT_PRECISION).expect("Invalid default time precision") } } @@ -542,15 +567,27 @@ impl TimeType { pub const DEFAULT_PRECISION: u32 = 0; - pub fn new(precision: u32) -> Self { + pub fn new(precision: u32) -> Result { Self::with_nullable(true, precision) } - pub fn with_nullable(nullable: bool, precision: u32) -> Self { - TimeType { + /// Create a TimeType with validation, returning an error if precision is invalid. 
+ pub fn with_nullable(nullable: bool, precision: u32) -> Result { + // Validate precision + if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) { + return Err(IllegalArgument { + message: format!( + "Time precision must be between {} and {} (both inclusive), got: {}", + Self::MIN_PRECISION, + Self::MAX_PRECISION, + precision + ), + }); + } + Ok(TimeType { nullable, precision, - } + }) } pub fn precision(&self) -> u32 { @@ -558,7 +595,7 @@ impl TimeType { } pub fn as_non_nullable(&self) -> Self { - Self::with_nullable(false, self.precision) + Self::with_nullable(false, self.precision).expect("Invalid time precision") } } @@ -580,7 +617,7 @@ pub struct TimestampType { impl Default for TimestampType { fn default() -> Self { - Self::new(Self::DEFAULT_PRECISION) + Self::new(Self::DEFAULT_PRECISION).expect("Invalid default timestamp precision") } } @@ -591,15 +628,27 @@ impl TimestampType { pub const DEFAULT_PRECISION: u32 = 6; - pub fn new(precision: u32) -> Self { + pub fn new(precision: u32) -> Result { Self::with_nullable(true, precision) } - pub fn with_nullable(nullable: bool, precision: u32) -> Self { - TimestampType { + /// Create a TimestampType with validation, returning an error if precision is invalid. 
+ pub fn with_nullable(nullable: bool, precision: u32) -> Result { + // Validate precision + if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) { + return Err(IllegalArgument { + message: format!( + "Timestamp precision must be between {} and {} (both inclusive), got: {}", + Self::MIN_PRECISION, + Self::MAX_PRECISION, + precision + ), + }); + } + Ok(TimestampType { nullable, precision, - } + }) } pub fn precision(&self) -> u32 { @@ -607,7 +656,7 @@ impl TimestampType { } pub fn as_non_nullable(&self) -> Self { - Self::with_nullable(false, self.precision) + Self::with_nullable(false, self.precision).expect("Invalid timestamp precision") } } @@ -630,6 +679,7 @@ pub struct TimestampLTzType { impl Default for TimestampLTzType { fn default() -> Self { Self::new(Self::DEFAULT_PRECISION) + .expect("Invalid default timestamp with local time zone precision") } } @@ -640,15 +690,27 @@ impl TimestampLTzType { pub const DEFAULT_PRECISION: u32 = 6; - pub fn new(precision: u32) -> Self { + pub fn new(precision: u32) -> Result { Self::with_nullable(true, precision) } - pub fn with_nullable(nullable: bool, precision: u32) -> Self { - TimestampLTzType { + /// Create a TimestampLTzType with validation, returning an error if precision is invalid. 
+ pub fn with_nullable(nullable: bool, precision: u32) -> Result { + // Validate precision + if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) { + return Err(IllegalArgument { + message: format!( + "Timestamp with local time zone precision must be between {} and {} (both inclusive), got: {}", + Self::MIN_PRECISION, + Self::MAX_PRECISION, + precision + ), + }); + } + Ok(TimestampLTzType { nullable, precision, - } + }) } pub fn precision(&self) -> u32 { @@ -657,6 +719,7 @@ impl TimestampLTzType { pub fn as_non_nullable(&self) -> Self { Self::with_nullable(false, self.precision) + .expect("Invalid timestamp with local time zone precision") } } @@ -985,7 +1048,7 @@ impl DataTypes { /// digits to the right of the decimal point in a number (=scale). `p` must have a value /// between 1 and 38 (both inclusive). `s` must have a value between 0 and `p` (both inclusive). pub fn decimal(precision: u32, scale: u32) -> DataType { - DataType::Decimal(DecimalType::new(precision, scale)) + DataType::Decimal(DecimalType::new(precision, scale).expect("Invalid decimal parameters")) } pub fn date() -> DataType { @@ -1000,7 +1063,7 @@ impl DataTypes { /// Data type of a time WITHOUT time zone `TIME(p)` where `p` is the number of digits /// of fractional seconds (=precision). `p` must have a value between 0 and 9 (both inclusive). pub fn time_with_precision(precision: u32) -> DataType { - DataType::Time(TimeType::new(precision)) + DataType::Time(TimeType::new(precision).expect("Invalid time precision")) } /// Data type of a timestamp WITHOUT time zone `TIMESTAMP` with 6 digits of fractional @@ -1013,7 +1076,7 @@ impl DataTypes { /// of digits of fractional seconds (=precision). `p` must have a value between 0 and 9 /// (both inclusive). 
pub fn timestamp_with_precision(precision: u32) -> DataType { - DataType::Timestamp(TimestampType::new(precision)) + DataType::Timestamp(TimestampType::new(precision).expect("Invalid timestamp precision")) } /// Data type of a timestamp WITH time zone `TIMESTAMP WITH TIME ZONE` with 6 digits of @@ -1025,7 +1088,10 @@ impl DataTypes { /// Data type of a timestamp WITH time zone `TIMESTAMP WITH TIME ZONE(p)` where `p` is the number /// of digits of fractional seconds (=precision). `p` must have a value between 0 and 9 (both inclusive). pub fn timestamp_ltz_with_precision(precision: u32) -> DataType { - DataType::TimestampLTz(TimestampLTzType::new(precision)) + DataType::TimestampLTz( + TimestampLTzType::new(precision) + .expect("Invalid timestamp with local time zone precision"), + ) } /// Data type of an array of elements with same subtype. @@ -1100,82 +1166,56 @@ impl Display for DataField { } #[test] -fn test_boolean_display() { +fn test_primitive_types_display() { + // Test simple primitive types with nullable and non-nullable variants assert_eq!(BooleanType::new().to_string(), "BOOLEAN"); assert_eq!( BooleanType::with_nullable(false).to_string(), "BOOLEAN NOT NULL" ); -} -#[test] -fn test_tinyint_display() { assert_eq!(TinyIntType::new().to_string(), "TINYINT"); assert_eq!( TinyIntType::with_nullable(false).to_string(), "TINYINT NOT NULL" ); -} -#[test] -fn test_smallint_display() { assert_eq!(SmallIntType::new().to_string(), "SMALLINT"); assert_eq!( SmallIntType::with_nullable(false).to_string(), "SMALLINT NOT NULL" ); -} -#[test] -fn test_int_display() { assert_eq!(IntType::new().to_string(), "INT"); assert_eq!(IntType::with_nullable(false).to_string(), "INT NOT NULL"); -} -#[test] -fn test_bigint_display() { assert_eq!(BigIntType::new().to_string(), "BIGINT"); assert_eq!( BigIntType::with_nullable(false).to_string(), "BIGINT NOT NULL" ); -} -#[test] -fn test_float_display() { assert_eq!(FloatType::new().to_string(), "FLOAT"); assert_eq!( 
FloatType::with_nullable(false).to_string(), "FLOAT NOT NULL" ); -} -#[test] -fn test_double_display() { assert_eq!(DoubleType::new().to_string(), "DOUBLE"); assert_eq!( DoubleType::with_nullable(false).to_string(), "DOUBLE NOT NULL" ); -} -#[test] -fn test_string_display() { assert_eq!(StringType::new().to_string(), "STRING"); assert_eq!( StringType::with_nullable(false).to_string(), "STRING NOT NULL" ); -} -#[test] -fn test_date_display() { assert_eq!(DateType::new().to_string(), "DATE"); assert_eq!(DateType::with_nullable(false).to_string(), "DATE NOT NULL"); -} -#[test] -fn test_bytes_display() { assert_eq!(BytesType::new().to_string(), "BYTES"); assert_eq!( BytesType::with_nullable(false).to_string(), @@ -1184,59 +1224,58 @@ fn test_bytes_display() { } #[test] -fn test_char_display() { +fn test_parameterized_types_display() { + // Test types with parameters (length, precision, scale, etc.) assert_eq!(CharType::new(10).to_string(), "CHAR(10)"); assert_eq!( CharType::with_nullable(20, false).to_string(), "CHAR(20) NOT NULL" ); -} -#[test] -fn test_decimal_display() { - assert_eq!(DecimalType::new(10, 2).to_string(), "DECIMAL(10, 2)"); + assert_eq!(BinaryType::new(100).to_string(), "BINARY(100)"); + assert_eq!( + BinaryType::with_nullable(false, 256).to_string(), + "BINARY(256) NOT NULL" + ); + assert_eq!( - DecimalType::with_nullable(false, 38, 10).to_string(), + DecimalType::new(10, 2).unwrap().to_string(), + "DECIMAL(10, 2)" + ); + assert_eq!( + DecimalType::with_nullable(false, 38, 10) + .unwrap() + .to_string(), "DECIMAL(38, 10) NOT NULL" ); -} -#[test] -fn test_time_display() { - assert_eq!(TimeType::new(0).to_string(), "TIME(0)"); - assert_eq!(TimeType::new(3).to_string(), "TIME(3)"); + assert_eq!(TimeType::new(0).unwrap().to_string(), "TIME(0)"); + assert_eq!(TimeType::new(3).unwrap().to_string(), "TIME(3)"); assert_eq!( - TimeType::with_nullable(false, 9).to_string(), + TimeType::with_nullable(false, 9).unwrap().to_string(), "TIME(9) NOT NULL" ); -} 
-#[test] -fn test_timestamp_display() { - assert_eq!(TimestampType::new(6).to_string(), "TIMESTAMP(6)"); - assert_eq!(TimestampType::new(0).to_string(), "TIMESTAMP(0)"); + assert_eq!(TimestampType::new(6).unwrap().to_string(), "TIMESTAMP(6)"); + assert_eq!(TimestampType::new(0).unwrap().to_string(), "TIMESTAMP(0)"); assert_eq!( - TimestampType::with_nullable(false, 9).to_string(), + TimestampType::with_nullable(false, 9).unwrap().to_string(), "TIMESTAMP(9) NOT NULL" ); -} -#[test] -fn test_timestamp_ltz_display() { - assert_eq!(TimestampLTzType::new(6).to_string(), "TIMESTAMP_LTZ(6)"); - assert_eq!(TimestampLTzType::new(3).to_string(), "TIMESTAMP_LTZ(3)"); assert_eq!( - TimestampLTzType::with_nullable(false, 9).to_string(), - "TIMESTAMP_LTZ(9) NOT NULL" + TimestampLTzType::new(6).unwrap().to_string(), + "TIMESTAMP_LTZ(6)" ); -} - -#[test] -fn test_binary_display() { - assert_eq!(BinaryType::new(100).to_string(), "BINARY(100)"); assert_eq!( - BinaryType::with_nullable(false, 256).to_string(), - "BINARY(256) NOT NULL" + TimestampLTzType::new(3).unwrap().to_string(), + "TIMESTAMP_LTZ(3)" + ); + assert_eq!( + TimestampLTzType::with_nullable(false, 9) + .unwrap() + .to_string(), + "TIMESTAMP_LTZ(9) NOT NULL" ); } @@ -1352,3 +1391,68 @@ fn test_deeply_nested_types() { )); assert_eq!(nested.to_string(), "ARRAY>>"); } + +#[test] +fn test_decimal_invalid_precision() { + // DecimalType::with_nullable should return an error for invalid precision + let result = DecimalType::with_nullable(true, 50, 2); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Decimal precision must be between 1 and 38") + ); +} + +#[test] +fn test_decimal_invalid_scale() { + // DecimalType::with_nullable should return an error when scale > precision + let result = DecimalType::with_nullable(true, 10, 15); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Decimal scale must be between 0 and the precision 10") + ); +} + 
+#[test] +fn test_time_invalid_precision() { + // TimeType::with_nullable should return an error for invalid precision + let result = TimeType::with_nullable(true, 10); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Time precision must be between 0 and 9") + ); +} + +#[test] +fn test_timestamp_invalid_precision() { + // TimestampType::with_nullable should return an error for invalid precision + let result = TimestampType::with_nullable(true, 10); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Timestamp precision must be between 0 and 9") + ); +} + +#[test] +fn test_timestamp_ltz_invalid_precision() { + // TimestampLTzType::with_nullable should return an error for invalid precision + let result = TimestampLTzType::with_nullable(true, 10); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Timestamp with local time zone precision must be between 0 and 9") + ); +} diff --git a/fluss-rust/crates/fluss/src/metadata/json_serde.rs b/fluss-rust/crates/fluss/src/metadata/json_serde.rs index 7d94e194e2..faa5583bed 100644 --- a/fluss-rust/crates/fluss/src/metadata/json_serde.rs +++ b/fluss-rust/crates/fluss/src/metadata/json_serde.rs @@ -202,7 +202,12 @@ impl JsonSerde for DataType { .get(Self::FIELD_NAME_SCALE) .and_then(|v| v.as_u64()) .unwrap_or(0) as u32; - DataTypes::decimal(precision, scale) + DataType::Decimal( + crate::metadata::datatype::DecimalType::with_nullable(true, precision, scale) + .map_err(|e| Error::JsonSerdeError { + message: format!("Invalid DECIMAL parameters: {}", e), + })?, + ) } "DATE" => DataTypes::date(), "TIME_WITHOUT_TIME_ZONE" => { @@ -210,21 +215,43 @@ impl JsonSerde for DataType { .get(Self::FIELD_NAME_PRECISION) .and_then(|v| v.as_u64()) .unwrap_or(0) as u32; - DataTypes::time_with_precision(precision) + DataType::Time( + crate::metadata::datatype::TimeType::with_nullable(true, precision).map_err( + |e| 
Error::JsonSerdeError { + message: format!("Invalid TIME_WITHOUT_TIME_ZONE precision: {}", e), + }, + )?, + ) } "TIMESTAMP_WITHOUT_TIME_ZONE" => { let precision = node .get(Self::FIELD_NAME_PRECISION) .and_then(|v| v.as_u64()) .unwrap_or(6) as u32; - DataTypes::timestamp_with_precision(precision) + DataType::Timestamp( + crate::metadata::datatype::TimestampType::with_nullable(true, precision) + .map_err(|e| Error::JsonSerdeError { + message: format!( + "Invalid TIMESTAMP_WITHOUT_TIME_ZONE precision: {}", + e + ), + })?, + ) } "TIMESTAMP_WITH_LOCAL_TIME_ZONE" => { let precision = node .get(Self::FIELD_NAME_PRECISION) .and_then(|v| v.as_u64()) .unwrap_or(6) as u32; - DataTypes::timestamp_ltz_with_precision(precision) + DataType::TimestampLTz( + crate::metadata::datatype::TimestampLTzType::with_nullable(true, precision) + .map_err(|e| Error::JsonSerdeError { + message: format!( + "Invalid TIMESTAMP_WITH_LOCAL_TIME_ZONE precision: {}", + e + ), + })?, + ) } "BYTES" => DataTypes::bytes(), "BINARY" => { @@ -689,4 +716,81 @@ mod tests { assert_eq!(dt, deserialized); } } + + #[test] + fn test_invalid_datatype_validation() { + use serde_json::json; + + // Invalid DECIMAL precision (> 38) + let invalid_decimal = json!({ + "type": "DECIMAL", + "precision": 50, + "scale": 2 + }); + let result = DataType::deserialize_json(&invalid_decimal); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Invalid DECIMAL parameters") + ); + + // Invalid TIME precision (> 9) + let invalid_time = json!({ + "type": "TIME_WITHOUT_TIME_ZONE", + "precision": 15 + }); + let result = DataType::deserialize_json(&invalid_time); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Invalid TIME_WITHOUT_TIME_ZONE precision") + ); + + // Invalid TIMESTAMP precision (> 9) + let invalid_timestamp = json!({ + "type": "TIMESTAMP_WITHOUT_TIME_ZONE", + "precision": 20 + }); + let result = 
DataType::deserialize_json(&invalid_timestamp); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Invalid TIMESTAMP_WITHOUT_TIME_ZONE precision") + ); + + // Invalid TIMESTAMP_LTZ precision (> 9) + let invalid_timestamp_ltz = json!({ + "type": "TIMESTAMP_WITH_LOCAL_TIME_ZONE", + "precision": 10 + }); + let result = DataType::deserialize_json(&invalid_timestamp_ltz); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Invalid TIMESTAMP_WITH_LOCAL_TIME_ZONE precision") + ); + + // Invalid DECIMAL scale (> precision) + let invalid_decimal_scale = json!({ + "type": "DECIMAL", + "precision": 10, + "scale": 15 + }); + let result = DataType::deserialize_json(&invalid_decimal_scale); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Invalid DECIMAL parameters") + ); + } } diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index aa48376bb6..3c46f9b5cd 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -1061,8 +1061,7 @@ pub struct MyVec(pub StreamReader); mod tests { use super::*; use crate::error::Error; - use crate::metadata::DataField; - use crate::metadata::DataTypes; + use crate::metadata::{DataField, DataTypes}; #[test] fn test_to_array_type() { @@ -1166,24 +1165,6 @@ mod tests { ); } - #[test] - #[should_panic(expected = "Invalid precision value for TimeType: 10")] - fn test_time_invalid_precision() { - to_arrow_type(&DataTypes::time_with_precision(10)); - } - - #[test] - #[should_panic(expected = "Invalid precision value for TimestampType: 10")] - fn test_timestamp_invalid_precision() { - to_arrow_type(&DataTypes::timestamp_with_precision(10)); - } - - #[test] - #[should_panic(expected = "Invalid precision value for TimestampLTzType: 10")] - fn test_timestamp_ltz_invalid_precision() { - 
to_arrow_type(&DataTypes::timestamp_ltz_with_precision(10)); - } - #[test] fn test_parse_ipc_message() { let empty_body: &[u8] = &le_bytes(&[0xFFFFFFFF, 0x00000000]); diff --git a/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs b/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs index 9917c7b76a..af2765c445 100644 --- a/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs +++ b/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs @@ -52,14 +52,20 @@ pub trait BinaryWriter { fn write_binary(&mut self, bytes: &[u8], length: usize); - // TODO Decimal type - // fn write_decimal(&mut self, pos: i32, value: f64); + fn write_decimal(&mut self, value: &crate::row::Decimal, precision: u32); - // TODO Timestamp type - // fn write_timestamp_ntz(&mut self, pos: i32, value: i64); + /// Writes a TIME value. + /// + /// Note: TIME is physically stored as an i32 (milliseconds since midnight). + /// This method exists for type safety and semantic clarity, even though it's + /// currently equivalent to `write_int()`. The precision parameter is accepted + /// for API consistency with TIMESTAMP types, though TIME encoding doesn't + /// currently vary by precision. 
+ fn write_time(&mut self, value: i32, precision: u32); - // TODO Timestamp type - // fn write_timestamp_ltz(&mut self, pos: i32, value: i64); + fn write_timestamp_ntz(&mut self, value: &crate::row::datum::TimestampNtz, precision: u32); + + fn write_timestamp_ltz(&mut self, value: &crate::row::datum::TimestampLtz, precision: u32); // TODO InternalArray, ArraySerializer // fn write_array(&mut self, pos: i32, value: i64); @@ -125,7 +131,12 @@ pub enum InnerValueWriter { BigInt, Float, Double, - // TODO Decimal, Date, TimeWithoutTimeZone, TimestampWithoutTimeZone, TimestampWithLocalTimeZone, Array, Row + Decimal(u32, u32), // precision, scale + Date, + Time(u32), // precision (not used in wire format, but kept for consistency) + TimestampNtz(u32), // precision + TimestampLtz(u32), // precision + // TODO Array, Row } /// Accessor for writing the fields/elements of a binary writer during runtime, the @@ -147,6 +158,23 @@ impl InnerValueWriter { DataType::BigInt(_) => Ok(InnerValueWriter::BigInt), DataType::Float(_) => Ok(InnerValueWriter::Float), DataType::Double(_) => Ok(InnerValueWriter::Double), + DataType::Decimal(d) => { + // Validation is done at DecimalType construction time + Ok(InnerValueWriter::Decimal(d.precision(), d.scale())) + } + DataType::Date(_) => Ok(InnerValueWriter::Date), + DataType::Time(t) => { + // Validation is done at TimeType construction time + Ok(InnerValueWriter::Time(t.precision())) + } + DataType::Timestamp(t) => { + // Validation is done at TimestampType construction time + Ok(InnerValueWriter::TimestampNtz(t.precision())) + } + DataType::TimestampLTz(t) => { + // Validation is done at TimestampLTzType construction time + Ok(InnerValueWriter::TimestampLtz(t.precision())) + } _ => unimplemented!( "ValueWriter for DataType {:?} is currently not implemented", data_type @@ -194,6 +222,21 @@ impl InnerValueWriter { (InnerValueWriter::Double, Datum::Float64(v)) => { writer.write_double(v.into_inner()); } + (InnerValueWriter::Decimal(p, _s), 
Datum::Decimal(v)) => { + writer.write_decimal(v, *p); + } + (InnerValueWriter::Date, Datum::Date(d)) => { + writer.write_int(d.get_inner()); + } + (InnerValueWriter::Time(p), Datum::Time(t)) => { + writer.write_time(t.get_inner(), *p); + } + (InnerValueWriter::TimestampNtz(p), Datum::TimestampNtz(ts)) => { + writer.write_timestamp_ntz(ts, *p); + } + (InnerValueWriter::TimestampLtz(p), Datum::TimestampLtz(ts)) => { + writer.write_timestamp_ltz(ts, *p); + } _ => { return Err(IllegalArgument { message: format!("{self:?} used to write value {value:?}"), diff --git a/fluss-rust/crates/fluss/src/row/column.rs b/fluss-rust/crates/fluss/src/row/column.rs index 90437c11aa..615e038454 100644 --- a/fluss-rust/crates/fluss/src/row/column.rs +++ b/fluss-rust/crates/fluss/src/row/column.rs @@ -17,9 +17,10 @@ use crate::row::InternalRow; use arrow::array::{ - AsArray, BinaryArray, FixedSizeBinaryArray, Float32Array, Float64Array, Int8Array, Int16Array, - Int32Array, Int64Array, RecordBatch, StringArray, + Array, AsArray, BinaryArray, Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, + Int8Array, Int16Array, Int32Array, Int64Array, RecordBatch, StringArray, }; +use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; use std::sync::Arc; #[derive(Clone)] @@ -54,6 +55,49 @@ impl ColumnarRow { pub fn get_record_batch(&self) -> &RecordBatch { &self.record_batch } + + /// Generic helper to read timestamp from Arrow, handling all TimeUnit conversions. + /// Like Java, the precision parameter is ignored - conversion is determined by Arrow TimeUnit. 
+ fn read_timestamp_from_arrow( + &self, + pos: usize, + _precision: u32, + construct_compact: impl FnOnce(i64) -> T, + construct_with_nanos: impl FnOnce(i64, i32) -> crate::error::Result, + ) -> T { + let schema = self.record_batch.schema(); + let arrow_field = schema.field(pos); + let value = self.get_long(pos); + + match arrow_field.data_type() { + ArrowDataType::Timestamp(time_unit, _) => { + // Convert based on Arrow TimeUnit + let (millis, nanos) = match time_unit { + TimeUnit::Second => (value * 1000, 0), + TimeUnit::Millisecond => (value, 0), + TimeUnit::Microsecond => { + let millis = value / 1000; + let nanos = ((value % 1000) * 1000) as i32; + (millis, nanos) + } + TimeUnit::Nanosecond => { + let millis = value / 1_000_000; + let nanos = (value % 1_000_000) as i32; + (millis, nanos) + } + }; + + if nanos == 0 { + construct_compact(millis) + } else { + // nanos is guaranteed to be in valid range [0, 999_999] by arithmetic + construct_with_nanos(millis, nanos) + .expect("nanos in valid range by construction") + } + } + other => panic!("Expected Timestamp column at position {pos}, got {other:?}"), + } + } } impl InternalRow for ColumnarRow { @@ -126,6 +170,88 @@ impl InternalRow for ColumnarRow { .value(self.row_id) } + fn get_decimal(&self, pos: usize, precision: usize, scale: usize) -> crate::row::Decimal { + use arrow::datatypes::DataType; + + let column = self.record_batch.column(pos); + let array = column + .as_any() + .downcast_ref::() + .unwrap_or_else(|| { + panic!( + "Expected Decimal128Array at column {}, found: {:?}", + pos, + column.data_type() + ) + }); + + // Contract: caller must check is_null_at() before calling get_decimal. 
+ // Calling on null value violates the contract and returns garbage data + debug_assert!( + !array.is_null(self.row_id), + "get_decimal called on null value at pos {} row {}", + pos, + self.row_id + ); + + // Read scale from Arrow schema field metadata + let schema = self.record_batch.schema(); + let field = schema.field(pos); + let arrow_scale = match field.data_type() { + DataType::Decimal128(_p, s) => *s as i64, + dt => panic!( + "Expected Decimal128 data type at column {}, found: {:?}", + pos, dt + ), + }; + + let i128_val = array.value(self.row_id); + + // Convert Arrow Decimal128 to Fluss Decimal (handles rescaling and validation) + crate::row::Decimal::from_arrow_decimal128( + i128_val, + arrow_scale, + precision as u32, + scale as u32, + ) + .unwrap_or_else(|e| { + panic!( + "Failed to create Decimal at column {} row {}: {}", + pos, self.row_id, e + ) + }) + } + + fn get_date(&self, pos: usize) -> crate::row::datum::Date { + crate::row::datum::Date::new(self.get_int(pos)) + } + + fn get_time(&self, pos: usize) -> crate::row::datum::Time { + crate::row::datum::Time::new(self.get_int(pos)) + } + + fn get_timestamp_ntz(&self, pos: usize, precision: u32) -> crate::row::datum::TimestampNtz { + // Like Java's ArrowTimestampNtzColumnVector, we ignore the precision parameter + // and determine the conversion from the Arrow column's TimeUnit. + self.read_timestamp_from_arrow( + pos, + precision, + crate::row::datum::TimestampNtz::new, + crate::row::datum::TimestampNtz::from_millis_nanos, + ) + } + + fn get_timestamp_ltz(&self, pos: usize, precision: u32) -> crate::row::datum::TimestampLtz { + // Like Java's ArrowTimestampLtzColumnVector, we ignore the precision parameter + // and determine the conversion from the Arrow column's TimeUnit. 
+ self.read_timestamp_from_arrow( + pos, + precision, + crate::row::datum::TimestampLtz::new, + crate::row::datum::TimestampLtz::from_millis_nanos, + ) + } + fn get_char(&self, pos: usize, _length: usize) -> &str { let array = self .record_batch @@ -229,4 +355,72 @@ mod tests { row.set_row_id(0); assert_eq!(row.get_row_id(), 0); } + + #[test] + fn columnar_row_reads_decimal() { + use arrow::datatypes::DataType; + use bigdecimal::{BigDecimal, num_bigint::BigInt}; + + // Test with Decimal128 + let schema = Arc::new(Schema::new(vec![ + Field::new("dec1", DataType::Decimal128(10, 2), false), + Field::new("dec2", DataType::Decimal128(20, 5), false), + Field::new("dec3", DataType::Decimal128(38, 10), false), + ])); + + // Create decimal values: 123.45, 12345.67890, large decimal + let dec1_val = 12345i128; // 123.45 with scale 2 + let dec2_val = 1234567890i128; // 12345.67890 with scale 5 + let dec3_val = 999999999999999999i128; // Large value (18 nines) with scale 10 + + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new( + Decimal128Array::from(vec![dec1_val]) + .with_precision_and_scale(10, 2) + .unwrap(), + ), + Arc::new( + Decimal128Array::from(vec![dec2_val]) + .with_precision_and_scale(20, 5) + .unwrap(), + ), + Arc::new( + Decimal128Array::from(vec![dec3_val]) + .with_precision_and_scale(38, 10) + .unwrap(), + ), + ], + ) + .expect("record batch"); + + let row = ColumnarRow::new(Arc::new(batch)); + assert_eq!(row.get_field_count(), 3); + + // Verify decimal values + assert_eq!( + row.get_decimal(0, 10, 2), + crate::row::Decimal::from_big_decimal(BigDecimal::new(BigInt::from(12345), 2), 10, 2) + .unwrap() + ); + assert_eq!( + row.get_decimal(1, 20, 5), + crate::row::Decimal::from_big_decimal( + BigDecimal::new(BigInt::from(1234567890), 5), + 20, + 5 + ) + .unwrap() + ); + assert_eq!( + row.get_decimal(2, 38, 10), + crate::row::Decimal::from_big_decimal( + BigDecimal::new(BigInt::from(999999999999999999i128), 10), + 38, + 10 + ) + .unwrap() + ); + } } 
diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs index 1152b0c5d2..339e366155 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs @@ -20,6 +20,7 @@ use bytes::Bytes; use crate::error::Result; use crate::metadata::DataType; +use crate::row::Decimal; use crate::row::binary::{BinaryRowFormat, BinaryWriter, ValueWriter}; use delegate::delegate; @@ -93,7 +94,13 @@ impl BinaryWriter for CompactedKeyWriter { fn write_double(&mut self, value: f64); + fn write_decimal(&mut self, value: &Decimal, precision: u32); + fn write_time(&mut self, value: i32, precision: u32); + + fn write_timestamp_ntz(&mut self, value: &crate::row::datum::TimestampNtz, precision: u32); + + fn write_timestamp_ltz(&mut self, value: &crate::row::datum::TimestampLtz, precision: u32); } } diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs index 144f8985cf..bc68ea10e5 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs @@ -133,6 +133,26 @@ impl<'a> InternalRow for CompactedRow<'a> { fn get_bytes(&self, pos: usize) -> &[u8] { self.decoded_row().get_bytes(pos) } + + fn get_decimal(&self, pos: usize, precision: usize, scale: usize) -> crate::row::Decimal { + self.decoded_row().get_decimal(pos, precision, scale) + } + + fn get_date(&self, pos: usize) -> crate::row::datum::Date { + self.decoded_row().get_date(pos) + } + + fn get_time(&self, pos: usize) -> crate::row::datum::Time { + self.decoded_row().get_time(pos) + } + + fn get_timestamp_ntz(&self, pos: usize, precision: u32) -> crate::row::datum::TimestampNtz { + self.decoded_row().get_timestamp_ntz(pos, precision) + } + + fn get_timestamp_ltz(&self, pos: usize, precision: u32) -> 
crate::row::datum::TimestampLtz { + self.decoded_row().get_timestamp_ltz(pos, precision) + } } #[cfg(test)] @@ -174,7 +194,7 @@ mod tests { writer.write_bytes(&[1, 2, 3, 4, 5]); let bytes = writer.to_bytes(); - let mut row = CompactedRow::from_bytes(&row_type, bytes.as_ref()); + let row = CompactedRow::from_bytes(&row_type, bytes.as_ref()); assert_eq!(row.get_field_count(), 9); assert!(row.get_boolean(0)); @@ -187,70 +207,107 @@ mod tests { assert_eq!(row.get_string(7), "Hello World"); assert_eq!(row.get_bytes(8), &[1, 2, 3, 4, 5]); - // Test with nulls - let row_type = RowType::with_data_types( - [ - DataType::Int(IntType::new()), - DataType::String(StringType::new()), - DataType::Double(DoubleType::new()), - ] - .to_vec(), - ); + // Test with nulls and negative values + let row_type = RowType::with_data_types(vec![ + DataType::Int(IntType::new()), + DataType::String(StringType::new()), + DataType::Double(DoubleType::new()), + ]); let mut writer = CompactedRowWriter::new(row_type.fields().len()); - - writer.write_int(100); + writer.write_int(-42); writer.set_null_at(1); writer.write_double(2.71); let bytes = writer.to_bytes(); - row = CompactedRow::from_bytes(&row_type, bytes.as_ref()); + let row = CompactedRow::from_bytes(&row_type, bytes.as_ref()); assert!(!row.is_null_at(0)); assert!(row.is_null_at(1)); assert!(!row.is_null_at(2)); - assert_eq!(row.get_int(0), 100); + assert_eq!(row.get_int(0), -42); assert_eq!(row.get_double(2), 2.71); + // Verify caching works on repeated reads + assert_eq!(row.get_int(0), -42); + } - // Test multiple reads (caching) - assert_eq!(row.get_int(0), 100); - assert_eq!(row.get_int(0), 100); + #[test] + fn test_compacted_row_temporal_and_decimal_types() { + // Comprehensive test covering DATE, TIME, TIMESTAMP (compact/non-compact), and DECIMAL (compact/non-compact) + use crate::metadata::{DataTypes, DecimalType, TimestampLTzType, TimestampType}; + use crate::row::Decimal; + use crate::row::datum::{TimestampLtz, TimestampNtz}; + use 
bigdecimal::{BigDecimal, num_bigint::BigInt}; - // Test from_bytes let row_type = RowType::with_data_types(vec![ - DataType::Int(IntType::new()), - DataType::String(StringType::new()), + DataTypes::date(), + DataTypes::time(), + DataType::Timestamp(TimestampType::with_nullable(true, 3).unwrap()), // Compact (precision <= 3) + DataType::TimestampLTz(TimestampLTzType::with_nullable(true, 3).unwrap()), // Compact + DataType::Timestamp(TimestampType::with_nullable(true, 6).unwrap()), // Non-compact (precision > 3) + DataType::TimestampLTz(TimestampLTzType::with_nullable(true, 9).unwrap()), // Non-compact + DataType::Decimal(DecimalType::new(10, 2).unwrap()), // Compact (precision <= 18) + DataType::Decimal(DecimalType::new(28, 10).unwrap()), // Non-compact (precision > 18) ]); let mut writer = CompactedRowWriter::new(row_type.fields().len()); - writer.write_int(-1); - writer.write_string("test"); - - let bytes = writer.to_bytes(); - let mut row = CompactedRow::from_bytes(&row_type, bytes.as_ref()); - - assert_eq!(row.get_int(0), -1); - assert_eq!(row.get_string(1), "test"); - // Test large row - let num_fields = 100; - let row_type = RowType::with_data_types( - (0..num_fields) - .map(|_| DataType::Int(IntType::new())) - .collect(), - ); - - let mut writer = CompactedRowWriter::new(num_fields); + // Write values + writer.write_int(19651); // Date: 2023-10-25 + writer.write_time(34200000, 0); // Time: 09:30:00.0 + writer.write_timestamp_ntz(&TimestampNtz::new(1698235273182), 3); // Compact timestamp + writer.write_timestamp_ltz(&TimestampLtz::new(1698235273182), 3); // Compact timestamp ltz + let ts_ntz_high = TimestampNtz::from_millis_nanos(1698235273182, 123456).unwrap(); + let ts_ltz_high = TimestampLtz::from_millis_nanos(1698235273182, 987654).unwrap(); + writer.write_timestamp_ntz(&ts_ntz_high, 6); // Non-compact timestamp with nanos + writer.write_timestamp_ltz(&ts_ltz_high, 9); // Non-compact timestamp ltz with nanos + + // Create Decimal values for testing + let 
small_decimal = + Decimal::from_big_decimal(BigDecimal::new(BigInt::from(12345), 2), 10, 2).unwrap(); // Compact decimal: 123.45 + let large_decimal = Decimal::from_big_decimal( + BigDecimal::new(BigInt::from(999999999999999999i128), 10), + 28, + 10, + ) + .unwrap(); // Non-compact decimal - for i in 0..num_fields { - writer.write_int((i * 10) as i32); - } + writer.write_decimal(&small_decimal, 10); + writer.write_decimal(&large_decimal, 28); let bytes = writer.to_bytes(); - row = CompactedRow::from_bytes(&row_type, bytes.as_ref()); - - for i in 0..num_fields { - assert_eq!(row.get_int(i), (i * 10) as i32); - } + let row = CompactedRow::from_bytes(&row_type, bytes.as_ref()); + + // Verify all values + assert_eq!(row.get_date(0).get_inner(), 19651); + assert_eq!(row.get_time(1).get_inner(), 34200000); + assert_eq!(row.get_timestamp_ntz(2, 3).get_millisecond(), 1698235273182); + assert_eq!( + row.get_timestamp_ltz(3, 3).get_epoch_millisecond(), + 1698235273182 + ); + let read_ts_ntz = row.get_timestamp_ntz(4, 6); + assert_eq!(read_ts_ntz.get_millisecond(), 1698235273182); + assert_eq!(read_ts_ntz.get_nano_of_millisecond(), 123456); + let read_ts_ltz = row.get_timestamp_ltz(5, 9); + assert_eq!(read_ts_ltz.get_epoch_millisecond(), 1698235273182); + assert_eq!(read_ts_ltz.get_nano_of_millisecond(), 987654); + // Assert on Decimal equality + assert_eq!(row.get_decimal(6, 10, 2), small_decimal); + assert_eq!(row.get_decimal(7, 28, 10), large_decimal); + + // Assert on Decimal components to catch any regressions + let read_small_decimal = row.get_decimal(6, 10, 2); + assert_eq!(read_small_decimal.precision(), 10); + assert_eq!(read_small_decimal.scale(), 2); + assert_eq!(read_small_decimal.to_unscaled_long().unwrap(), 12345); + + let read_large_decimal = row.get_decimal(7, 28, 10); + assert_eq!(read_large_decimal.precision(), 28); + assert_eq!(read_large_decimal.scale(), 10); + assert_eq!( + read_large_decimal.to_unscaled_long().unwrap(), + 999999999999999999i64 + ); } } 
diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs index 408706cc83..40470db170 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs @@ -19,7 +19,7 @@ use crate::metadata::RowType; use crate::row::compacted::compacted_row::calculate_bit_set_width_in_bytes; use crate::{ metadata::DataType, - row::{Datum, GenericRow, compacted::compacted_row_writer::CompactedRowWriter}, + row::{Datum, Decimal, GenericRow, compacted::compacted_row_writer::CompactedRowWriter}, util::varint::{read_unsigned_varint_at, read_unsigned_varint_u64_at}, }; use std::borrow::Cow; @@ -97,7 +97,75 @@ impl<'a> CompactedRowDeserializer<'a> { let (val, next) = reader.read_bytes(cursor); (Datum::Blob(val.into()), next) } - _ => panic!("unsupported DataType in CompactedRowDeserializer"), + DataType::Decimal(decimal_type) => { + let precision = decimal_type.precision(); + let scale = decimal_type.scale(); + if Decimal::is_compact_precision(precision) { + // Compact: stored as i64 + let (val, next) = reader.read_long(cursor); + let decimal = Decimal::from_unscaled_long(val, precision, scale) + .expect("Failed to create decimal from unscaled long"); + (Datum::Decimal(decimal), next) + } else { + // Non-compact: stored as minimal big-endian bytes + let (bytes, next) = reader.read_bytes(cursor); + let decimal = Decimal::from_unscaled_bytes(bytes, precision, scale) + .expect("Failed to create decimal from unscaled bytes"); + (Datum::Decimal(decimal), next) + } + } + DataType::Date(_) => { + let (val, next) = reader.read_int(cursor); + (Datum::Date(crate::row::datum::Date::new(val)), next) + } + DataType::Time(_) => { + let (val, next) = reader.read_int(cursor); + (Datum::Time(crate::row::datum::Time::new(val)), next) + } + DataType::Timestamp(timestamp_type) => { + let precision = timestamp_type.precision(); + if 
crate::row::datum::TimestampNtz::is_compact(precision) { + // Compact: only milliseconds + let (millis, next) = reader.read_long(cursor); + ( + Datum::TimestampNtz(crate::row::datum::TimestampNtz::new(millis)), + next, + ) + } else { + // Non-compact: milliseconds + nanos + let (millis, mid) = reader.read_long(cursor); + let (nanos, next) = reader.read_int(mid); + let timestamp = + crate::row::datum::TimestampNtz::from_millis_nanos(millis, nanos) + .expect("Invalid nano_of_millisecond value in compacted row"); + (Datum::TimestampNtz(timestamp), next) + } + } + DataType::TimestampLTz(timestamp_ltz_type) => { + let precision = timestamp_ltz_type.precision(); + if crate::row::datum::TimestampLtz::is_compact(precision) { + // Compact: only epoch milliseconds + let (epoch_millis, next) = reader.read_long(cursor); + ( + Datum::TimestampLtz(crate::row::datum::TimestampLtz::new(epoch_millis)), + next, + ) + } else { + // Non-compact: epoch milliseconds + nanos + let (epoch_millis, mid) = reader.read_long(cursor); + let (nanos, next) = reader.read_int(mid); + let timestamp_ltz = + crate::row::datum::TimestampLtz::from_millis_nanos(epoch_millis, nanos) + .expect("Invalid nano_of_millisecond value in compacted row"); + (Datum::TimestampLtz(timestamp_ltz), next) + } + } + _ => { + panic!( + "Unsupported DataType in CompactedRowDeserializer: {:?}", + dtype + ); + } }; cursor = next_cursor; row.set_field(col_pos, datum); diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs index c130e94cce..d1ad047a72 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use crate::row::Decimal; use crate::row::binary::BinaryWriter; use crate::row::compacted::compacted_row::calculate_bit_set_width_in_bytes; use crate::util::varint::{write_unsigned_varint_to_slice, write_unsigned_varint_u64_to_slice}; @@ -76,6 +77,7 @@ impl CompactedRowWriter { self.position = end; } } + impl BinaryWriter for CompactedRowWriter { fn reset(&mut self) { self.position = self.header_size_in_bytes; @@ -91,32 +93,34 @@ impl BinaryWriter for CompactedRowWriter { fn write_boolean(&mut self, value: bool) { let b = if value { 1u8 } else { 0u8 }; - self.write_raw(&[b]); + self.write_raw(&[b]) } fn write_byte(&mut self, value: u8) { - self.write_raw(&[value]); + self.write_raw(&[value]) } fn write_bytes(&mut self, value: &[u8]) { - let len_i32 = - i32::try_from(value.len()).expect("byte slice too large to encode length as i32"); + let len_i32 = i32::try_from(value.len()) + .expect("Byte slice too large to encode length as i32: exceeds i32::MAX"); self.write_int(len_i32); - self.write_raw(value); + self.write_raw(value) } fn write_char(&mut self, value: &str, _length: usize) { // TODO: currently, we encoding CHAR(length) as the same with STRING, the length info can be // omitted and the bytes length should be enforced in the future. 
- self.write_string(value); + self.write_string(value) } fn write_string(&mut self, value: &str) { - self.write_bytes(value.as_ref()); + self.write_bytes(value.as_ref()) } fn write_short(&mut self, value: i16) { - self.write_raw(&value.to_ne_bytes()); + // Use native endianness to match Java's UnsafeUtils.putShort behavior + // Java uses sun.misc.Unsafe which writes in native byte order (typically LE on x86/ARM) + self.write_raw(&value.to_ne_bytes()) } fn write_int(&mut self, value: i32) { @@ -132,21 +136,120 @@ impl BinaryWriter for CompactedRowWriter { write_unsigned_varint_u64_to_slice(value as u64, &mut self.buffer[self.position..]); self.position += bytes_written; } + fn write_float(&mut self, value: f32) { - self.write_raw(&value.to_ne_bytes()); + // Use native endianness to match Java's UnsafeUtils.putFloat behavior + self.write_raw(&value.to_ne_bytes()) } fn write_double(&mut self, value: f64) { - self.write_raw(&value.to_ne_bytes()); + // Use native endianness to match Java's UnsafeUtils.putDouble behavior + self.write_raw(&value.to_ne_bytes()) } fn write_binary(&mut self, bytes: &[u8], length: usize) { // TODO: currently, we encoding BINARY(length) as the same with BYTES, the length info can // be omitted and the bytes length should be enforced in the future. - self.write_bytes(&bytes[..length.min(bytes.len())]); + self.write_bytes(&bytes[..length.min(bytes.len())]) } fn complete(&mut self) { // do nothing } + + fn write_decimal(&mut self, value: &Decimal, precision: u32) { + // Decimal is already validated and rescaled during construction. + // Just serialize the precomputed unscaled representation. 
+ if Decimal::is_compact_precision(precision) { + self.write_long( + value + .to_unscaled_long() + .expect("Decimal should fit in i64 for compact precision"), + ) + } else { + self.write_bytes(&value.to_unscaled_bytes()) + } + } + + fn write_time(&mut self, value: i32, _precision: u32) { + // TIME is always encoded as i32 (milliseconds since midnight) regardless of precision + self.write_int(value) + } + + fn write_timestamp_ntz(&mut self, value: &crate::row::datum::TimestampNtz, precision: u32) { + if crate::row::datum::TimestampNtz::is_compact(precision) { + // Compact: write only milliseconds + self.write_long(value.get_millisecond()); + } else { + // Non-compact: write milliseconds + nanoOfMillisecond + self.write_long(value.get_millisecond()); + self.write_int(value.get_nano_of_millisecond()); + } + } + + fn write_timestamp_ltz(&mut self, value: &crate::row::datum::TimestampLtz, precision: u32) { + if crate::row::datum::TimestampLtz::is_compact(precision) { + // Compact: write only epoch milliseconds + self.write_long(value.get_epoch_millisecond()); + } else { + // Non-compact: write epoch milliseconds + nanoOfMillisecond + self.write_long(value.get_epoch_millisecond()); + self.write_int(value.get_nano_of_millisecond()); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bigdecimal::{BigDecimal, num_bigint::BigInt}; + + #[test] + fn test_write_decimal_compact() { + // Compact decimal (precision <= 18) + let bd = BigDecimal::new(BigInt::from(12345), 2); // 123.45 + let decimal = Decimal::from_big_decimal(bd, 10, 2).unwrap(); + + let mut w = CompactedRowWriter::new(1); + w.write_decimal(&decimal, 10); + + let (val, _) = crate::util::varint::read_unsigned_varint_u64_at( + w.buffer(), + w.header_size_in_bytes, + CompactedRowWriter::MAX_LONG_SIZE, + ) + .unwrap(); + assert_eq!(val as i64, 12345); + } + + #[test] + fn test_write_decimal_rounding() { + // Test HALF_UP rounding: 12.345 → 12.35 + let bd = BigDecimal::new(BigInt::from(12345), 3); + let 
decimal = Decimal::from_big_decimal(bd, 10, 2).unwrap(); + + let mut w = CompactedRowWriter::new(1); + w.write_decimal(&decimal, 10); + + let (val, _) = crate::util::varint::read_unsigned_varint_u64_at( + w.buffer(), + w.header_size_in_bytes, + CompactedRowWriter::MAX_LONG_SIZE, + ) + .unwrap(); + assert_eq!(val as i64, 1235); // 12.35 with scale 2 + } + + #[test] + fn test_write_decimal_non_compact() { + // Non-compact (precision > 18): uses byte array + let bd = BigDecimal::new(BigInt::from(12345), 0); + let decimal = Decimal::from_big_decimal(bd, 28, 0).unwrap(); + + let mut w = CompactedRowWriter::new(1); + w.write_decimal(&decimal, 28); + + // Verify something was written (at least length varint + some bytes) + assert!(w.position() > w.header_size_in_bytes); + } } diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs index ad7948dcef..5b21b3899b 100644 --- a/fluss-rust/crates/fluss/src/row/datum.rs +++ b/fluss-rust/crates/fluss/src/row/datum.rs @@ -17,6 +17,7 @@ use crate::error::Error::RowConvertError; use crate::error::Result; +use crate::row::Decimal; use arrow::array::{ ArrayBuilder, BinaryBuilder, BooleanBuilder, Float32Builder, Float64Builder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, StringBuilder, @@ -24,7 +25,6 @@ use arrow::array::{ use jiff::ToSpan; use ordered_float::OrderedFloat; use parse_display::Display; -use rust_decimal::Decimal; use serde::Serialize; use std::borrow::Cow; @@ -58,9 +58,11 @@ pub enum Datum<'a> { #[display("{0}")] Date(Date), #[display("{0}")] - Timestamp(Timestamp), + Time(Time), #[display("{0}")] - TimestampTz(TimestampLtz), + TimestampNtz(TimestampNtz), + #[display("{0}")] + TimestampLtz(TimestampLtz), } impl Datum<'_> { @@ -296,7 +298,11 @@ impl Datum<'_> { Datum::Float64(v) => append_value_to_arrow!(Float64Builder, v.into_inner()), Datum::String(v) => append_value_to_arrow!(StringBuilder, v.as_ref()), Datum::Blob(v) => append_value_to_arrow!(BinaryBuilder, 
v.as_ref()), - Datum::Decimal(_) | Datum::Date(_) | Datum::Timestamp(_) | Datum::TimestampTz(_) => { + Datum::Decimal(_) + | Datum::Date(_) + | Datum::Time(_) + | Datum::TimestampNtz(_) + | Datum::TimestampLtz(_) => { return Err(RowConvertError { message: format!( "Type {:?} is not yet supported for Arrow conversion", @@ -350,10 +356,122 @@ pub type F64 = OrderedFloat; pub struct Date(i32); #[derive(PartialOrd, Ord, Display, PartialEq, Eq, Debug, Copy, Clone, Default, Hash, Serialize)] -pub struct Timestamp(i64); +pub struct Time(i32); + +impl Time { + pub const fn new(inner: i32) -> Self { + Time(inner) + } + + /// Get the inner value of time type (milliseconds since midnight) + pub fn get_inner(&self) -> i32 { + self.0 + } +} + +/// Maximum timestamp precision that can be stored compactly (milliseconds only). +/// Values with precision > MAX_COMPACT_TIMESTAMP_PRECISION require additional nanosecond storage. +pub const MAX_COMPACT_TIMESTAMP_PRECISION: u32 = 3; + +/// Maximum valid value for nanoseconds within a millisecond (0 to 999,999 inclusive). +/// A millisecond contains 1,000,000 nanoseconds, so the fractional part ranges from 0 to 999,999. 
+pub const MAX_NANO_OF_MILLISECOND: i32 = 999_999; #[derive(PartialOrd, Ord, Display, PartialEq, Eq, Debug, Copy, Clone, Default, Hash, Serialize)] -pub struct TimestampLtz(i64); +#[display("{millisecond}")] +pub struct TimestampNtz { + millisecond: i64, + nano_of_millisecond: i32, +} + +impl TimestampNtz { + pub const fn new(millisecond: i64) -> Self { + TimestampNtz { + millisecond, + nano_of_millisecond: 0, + } + } + + pub fn from_millis_nanos( + millisecond: i64, + nano_of_millisecond: i32, + ) -> crate::error::Result { + if !(0..=MAX_NANO_OF_MILLISECOND).contains(&nano_of_millisecond) { + return Err(crate::error::Error::IllegalArgument { + message: format!( + "nanoOfMillisecond must be in range [0, {}], got: {}", + MAX_NANO_OF_MILLISECOND, nano_of_millisecond + ), + }); + } + Ok(TimestampNtz { + millisecond, + nano_of_millisecond, + }) + } + + pub fn get_millisecond(&self) -> i64 { + self.millisecond + } + + pub fn get_nano_of_millisecond(&self) -> i32 { + self.nano_of_millisecond + } + + /// Check if the timestamp is compact based on precision. + /// Precision <= MAX_COMPACT_TIMESTAMP_PRECISION means millisecond precision, no need for nanos. 
+ pub fn is_compact(precision: u32) -> bool { + precision <= MAX_COMPACT_TIMESTAMP_PRECISION + } +} + +#[derive(PartialOrd, Ord, Display, PartialEq, Eq, Debug, Copy, Clone, Default, Hash, Serialize)] +#[display("{epoch_millisecond}")] +pub struct TimestampLtz { + epoch_millisecond: i64, + nano_of_millisecond: i32, +} + +impl TimestampLtz { + pub const fn new(epoch_millisecond: i64) -> Self { + TimestampLtz { + epoch_millisecond, + nano_of_millisecond: 0, + } + } + + pub fn from_millis_nanos( + epoch_millisecond: i64, + nano_of_millisecond: i32, + ) -> crate::error::Result { + if !(0..=MAX_NANO_OF_MILLISECOND).contains(&nano_of_millisecond) { + return Err(crate::error::Error::IllegalArgument { + message: format!( + "nanoOfMillisecond must be in range [0, {}], got: {}", + MAX_NANO_OF_MILLISECOND, nano_of_millisecond + ), + }); + } + Ok(TimestampLtz { + epoch_millisecond, + nano_of_millisecond, + }) + } + + pub fn get_epoch_millisecond(&self) -> i64 { + self.epoch_millisecond + } + + pub fn get_nano_of_millisecond(&self) -> i32 { + self.nano_of_millisecond + } + + /// Check if the timestamp is compact based on precision. + /// Precision <= MAX_COMPACT_TIMESTAMP_PRECISION means millisecond precision, no need for nanos. 
+ pub fn is_compact(precision: u32) -> bool { + precision <= MAX_COMPACT_TIMESTAMP_PRECISION + } +} pub type Blob<'a> = Cow<'a, [u8]>; @@ -461,3 +579,54 @@ mod tests { assert_eq!(date.day(), 1); } } + +#[cfg(test)] +mod timestamp_tests { + use super::*; + + #[test] + fn test_timestamp_valid_nanos() { + // Valid range: 0 to MAX_NANO_OF_MILLISECOND for both TimestampNtz and TimestampLtz + let ntz1 = TimestampNtz::from_millis_nanos(1000, 0).unwrap(); + assert_eq!(ntz1.get_nano_of_millisecond(), 0); + + let ntz2 = TimestampNtz::from_millis_nanos(1000, MAX_NANO_OF_MILLISECOND).unwrap(); + assert_eq!(ntz2.get_nano_of_millisecond(), MAX_NANO_OF_MILLISECOND); + + let ntz3 = TimestampNtz::from_millis_nanos(1000, 500_000).unwrap(); + assert_eq!(ntz3.get_nano_of_millisecond(), 500_000); + + let ltz1 = TimestampLtz::from_millis_nanos(1000, 0).unwrap(); + assert_eq!(ltz1.get_nano_of_millisecond(), 0); + + let ltz2 = TimestampLtz::from_millis_nanos(1000, MAX_NANO_OF_MILLISECOND).unwrap(); + assert_eq!(ltz2.get_nano_of_millisecond(), MAX_NANO_OF_MILLISECOND); + } + + #[test] + fn test_timestamp_nanos_out_of_range() { + // Test that both TimestampNtz and TimestampLtz reject invalid nanos + let expected_msg = format!( + "nanoOfMillisecond must be in range [0, {}]", + MAX_NANO_OF_MILLISECOND + ); + + // Too large (1,000,000 is just beyond the valid range) + let result_ntz = TimestampNtz::from_millis_nanos(1000, MAX_NANO_OF_MILLISECOND + 1); + assert!(result_ntz.is_err()); + assert!(result_ntz.unwrap_err().to_string().contains(&expected_msg)); + + let result_ltz = TimestampLtz::from_millis_nanos(1000, MAX_NANO_OF_MILLISECOND + 1); + assert!(result_ltz.is_err()); + assert!(result_ltz.unwrap_err().to_string().contains(&expected_msg)); + + // Negative + let result_ntz = TimestampNtz::from_millis_nanos(1000, -1); + assert!(result_ntz.is_err()); + assert!(result_ntz.unwrap_err().to_string().contains(&expected_msg)); + + let result_ltz = TimestampLtz::from_millis_nanos(1000, -1); + 
assert!(result_ltz.is_err()); + assert!(result_ltz.unwrap_err().to_string().contains(&expected_msg)); + } +} diff --git a/fluss-rust/crates/fluss/src/row/decimal.rs b/fluss-rust/crates/fluss/src/row/decimal.rs new file mode 100644 index 0000000000..b14bde50a4 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/decimal.rs @@ -0,0 +1,477 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::{Error, Result}; +use bigdecimal::num_bigint::BigInt; +use bigdecimal::num_traits::Zero; +use bigdecimal::{BigDecimal, RoundingMode}; +use std::fmt; + +#[cfg(test)] +use std::str::FromStr; + +/// Maximum decimal precision that can be stored compactly as a single i64. +/// Values with precision > MAX_COMPACT_PRECISION require byte array storage. +pub const MAX_COMPACT_PRECISION: u32 = 18; + +/// An internal data structure representing a decimal value with fixed precision and scale. +/// +/// This data structure is immutable and stores decimal values in a compact representation +/// (as a long value) if values are small enough (precision ≤ 18). +/// +/// Matches Java's org.apache.fluss.row.Decimal class. 
+#[derive(Debug, Clone, serde::Serialize)] +pub struct Decimal { + precision: u32, + scale: u32, + // If precision <= MAX_COMPACT_PRECISION, this holds the unscaled value + long_val: Option, + // BigDecimal representation (may be cached) + decimal_val: Option, +} + +impl Decimal { + /// Returns the precision of this Decimal. + /// + /// The precision is the number of digits in the unscaled value. + pub fn precision(&self) -> u32 { + self.precision + } + + /// Returns the scale of this Decimal. + pub fn scale(&self) -> u32 { + self.scale + } + + /// Returns whether the decimal value is small enough to be stored in a long. + pub fn is_compact(&self) -> bool { + self.precision <= MAX_COMPACT_PRECISION + } + + /// Returns whether a given precision can be stored compactly. + pub fn is_compact_precision(precision: u32) -> bool { + precision <= MAX_COMPACT_PRECISION + } + + /// Converts this Decimal into a BigDecimal. + pub fn to_big_decimal(&self) -> BigDecimal { + if let Some(bd) = &self.decimal_val { + bd.clone() + } else if let Some(long_val) = self.long_val { + BigDecimal::new(BigInt::from(long_val), self.scale as i64) + } else { + // Should never happen - we always have one representation + BigDecimal::new(BigInt::from(0), self.scale as i64) + } + } + + /// Returns a long describing the unscaled value of this Decimal. + pub fn to_unscaled_long(&self) -> Result { + if let Some(long_val) = self.long_val { + Ok(long_val) + } else { + // Extract unscaled value from BigDecimal + let bd = self.to_big_decimal(); + let (unscaled, _) = bd.as_bigint_and_exponent(); + unscaled.try_into().map_err(|_| Error::IllegalArgument { + message: format!( + "Decimal unscaled value does not fit in i64: precision={}", + self.precision + ), + }) + } + } + + /// Returns a byte array describing the unscaled value of this Decimal. 
+ pub fn to_unscaled_bytes(&self) -> Vec { + let bd = self.to_big_decimal(); + let (unscaled, _) = bd.as_bigint_and_exponent(); + unscaled.to_signed_bytes_be() + } + + /// Creates a Decimal from Arrow's Decimal128 representation. + // TODO: For compact decimals with matching scale we may call from_unscaled_long + pub fn from_arrow_decimal128( + i128_val: i128, + arrow_scale: i64, + precision: u32, + scale: u32, + ) -> Result { + let bd = BigDecimal::new(BigInt::from(i128_val), arrow_scale); + Self::from_big_decimal(bd, precision, scale) + } + + /// Creates an instance of Decimal from a BigDecimal with the given precision and scale. + /// + /// The returned decimal value may be rounded to have the desired scale. The precision + /// will be checked. If the precision overflows, an error is returned. + pub fn from_big_decimal(bd: BigDecimal, precision: u32, scale: u32) -> Result { + // Rescale to the target scale with HALF_UP rounding (matches Java) + let scaled = bd.with_scale_round(scale as i64, RoundingMode::HalfUp); + + // Extract unscaled value + let (unscaled, exp) = scaled.as_bigint_and_exponent(); + + // Sanity check that scale matches + debug_assert_eq!( + exp, scale as i64, + "Scaled decimal exponent ({}) != expected scale ({})", + exp, scale + ); + + let actual_precision = Self::compute_precision(&unscaled); + if actual_precision > precision as usize { + return Err(Error::IllegalArgument { + message: format!( + "Decimal precision overflow: value has {} digits but precision is {} (value: {})", + actual_precision, precision, scaled + ), + }); + } + + // Compute compact representation if possible + let long_val = if precision <= MAX_COMPACT_PRECISION { + Some(i64::try_from(&unscaled).map_err(|_| Error::IllegalArgument { + message: format!( + "Decimal mantissa exceeds i64 range for compact precision {}: unscaled={} (value={})", + precision, unscaled, scaled + ), + })?) 
+ } else { + None + }; + + Ok(Decimal { + precision, + scale, + long_val, + decimal_val: Some(scaled), + }) + } + + /// Creates an instance of Decimal from an unscaled long value with the given precision and scale. + pub fn from_unscaled_long(unscaled_long: i64, precision: u32, scale: u32) -> Result { + if precision > MAX_COMPACT_PRECISION { + return Err(Error::IllegalArgument { + message: format!( + "Precision {} exceeds MAX_COMPACT_PRECISION ({})", + precision, MAX_COMPACT_PRECISION + ), + }); + } + + let actual_precision = Self::compute_precision(&BigInt::from(unscaled_long)); + if actual_precision > precision as usize { + return Err(Error::IllegalArgument { + message: format!( + "Decimal precision overflow: unscaled value has {} digits but precision is {}", + actual_precision, precision + ), + }); + } + + Ok(Decimal { + precision, + scale, + long_val: Some(unscaled_long), + decimal_val: None, + }) + } + + /// Creates an instance of Decimal from an unscaled byte array with the given precision and scale. + pub fn from_unscaled_bytes(unscaled_bytes: &[u8], precision: u32, scale: u32) -> Result { + let unscaled = BigInt::from_signed_bytes_be(unscaled_bytes); + let bd = BigDecimal::new(unscaled, scale as i64); + Self::from_big_decimal(bd, precision, scale) + } + + /// Computes the precision of a decimal's unscaled value, matching Java's BigDecimal.precision(). + pub fn compute_precision(unscaled: &BigInt) -> usize { + if unscaled.is_zero() { + return 1; + } + + // Count ALL digits in the unscaled value (matches Java's BigDecimal.precision()) + // For bounded precision (≤ 38 digits), string conversion is cheap and simple. 
+ unscaled.magnitude().to_str_radix(10).len() + } +} + +impl fmt::Display for Decimal { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.to_big_decimal()) + } +} + +// Manual implementations of comparison traits to ignore cached fields +impl PartialEq for Decimal { + fn eq(&self, other: &Self) -> bool { + // Use numeric equality like Java's Decimal.equals() which delegates to compareTo. + // This means 1.0 (scale=1) equals 1.00 (scale=2). + self.cmp(other) == std::cmp::Ordering::Equal + } +} + +impl Eq for Decimal {} + +impl PartialOrd for Decimal { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Decimal { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // If both are compact and have the same scale, compare directly + if self.is_compact() && other.is_compact() && self.scale == other.scale { + self.long_val.cmp(&other.long_val) + } else { + // Otherwise, compare as BigDecimal + self.to_big_decimal().cmp(&other.to_big_decimal()) + } + } +} + +impl std::hash::Hash for Decimal { + fn hash(&self, state: &mut H) { + // Hash the BigDecimal representation. + // + // IMPORTANT: Unlike Java's BigDecimal, Rust's bigdecimal crate normalizes + // before hashing, so hash(1.0) == hash(1.00). Combined with our numeric + // equality (1.0 == 1.00), this CORRECTLY satisfies the hash/equals contract. + // + // This is BETTER than Java's implementation which has a hash/equals violation: + // - Java: equals(1.0, 1.00) = true, but hashCode(1.0) != hashCode(1.00) + // - Rust: equals(1.0, 1.00) = true, and hash(1.0) == hash(1.00) ✓ + // + // Result: HashMap/HashSet will work correctly even if you create Decimals + // with different scales for the same numeric value (though this is rare in + // practice since decimals are schema-driven with fixed precision/scale). 
+ self.to_big_decimal().hash(state); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_precision_calculation() { + // Zero is special case + assert_eq!(Decimal::compute_precision(&BigInt::from(0)), 1); + + // Must count ALL digits including trailing zeros (matches Java BigDecimal.precision()) + assert_eq!(Decimal::compute_precision(&BigInt::from(10)), 2); + assert_eq!(Decimal::compute_precision(&BigInt::from(100)), 3); + assert_eq!(Decimal::compute_precision(&BigInt::from(12300)), 5); + assert_eq!( + Decimal::compute_precision(&BigInt::from(10000000000i64)), + 11 + ); + + // Test the case: value=1, scale=10 → unscaled=10000000000 (11 digits) + let bd = BigDecimal::new(BigInt::from(1), 0); + assert!( + Decimal::from_big_decimal(bd.clone(), 1, 10).is_err(), + "Should reject: unscaled 10000000000 has 11 digits, precision=1 is too small" + ); + assert!( + Decimal::from_big_decimal(bd, 11, 10).is_ok(), + "Should accept with correct precision=11" + ); + } + + /// Test precision validation boundaries + #[test] + fn test_precision_validation() { + let test_cases = vec![ + (10i64, 1, 2), // 1.0 → unscaled: 10 (2 digits) + (100i64, 2, 3), // 1.00 → unscaled: 100 (3 digits) + (10000000000i64, 10, 11), // 1.0000000000 → unscaled: 10000000000 (11 digits) + ]; + + for (unscaled, scale, min_precision) in test_cases { + let bd = BigDecimal::new(BigInt::from(unscaled), scale as i64); + + // Reject if precision too small + assert!(Decimal::from_big_decimal(bd.clone(), min_precision - 1, scale).is_err()); + // Accept with correct precision + assert!(Decimal::from_big_decimal(bd, min_precision, scale).is_ok()); + } + + // i64::MAX has 19 digits, should reject with precision=5 + let bd = BigDecimal::new(BigInt::from(i64::MAX), 0); + assert!(Decimal::from_big_decimal(bd, 5, 0).is_err()); + } + + /// Test creation and basic operations for both compact and non-compact decimals + #[test] + fn test_creation_and_representation() { + // Compact (precision ≤ 18): from 
unscaled long + let compact = Decimal::from_unscaled_long(12345, 10, 2).unwrap(); + assert_eq!(compact.precision(), 10); + assert_eq!(compact.scale(), 2); + assert!(compact.is_compact()); + assert_eq!(compact.to_unscaled_long().unwrap(), 12345); + assert_eq!(compact.to_big_decimal().to_string(), "123.45"); + + // Non-compact (precision > 18): from BigDecimal + let bd = BigDecimal::new(BigInt::from(12345), 0); + let non_compact = Decimal::from_big_decimal(bd, 28, 0).unwrap(); + assert_eq!(non_compact.precision(), 28); + assert!(!non_compact.is_compact()); + assert_eq!( + non_compact.to_unscaled_bytes(), + BigInt::from(12345).to_signed_bytes_be() + ); + + // Test compact boundary + assert!(Decimal::is_compact_precision(18)); + assert!(!Decimal::is_compact_precision(19)); + + // Test rounding during creation + let bd = BigDecimal::new(BigInt::from(12345), 3); // 12.345 + let rounded = Decimal::from_big_decimal(bd, 10, 2).unwrap(); + assert_eq!(rounded.to_unscaled_long().unwrap(), 1235); // 12.35 + } + + /// Test serialization round-trip (unscaled bytes) + #[test] + fn test_serialization_roundtrip() { + // Compact decimal + let bd1 = BigDecimal::new(BigInt::from(1314567890123i64), 5); // 13145678.90123 + let decimal1 = Decimal::from_big_decimal(bd1.clone(), 15, 5).unwrap(); + let (unscaled1, _) = bd1.as_bigint_and_exponent(); + let from_bytes1 = + Decimal::from_unscaled_bytes(&unscaled1.to_signed_bytes_be(), 15, 5).unwrap(); + assert_eq!(from_bytes1, decimal1); + assert_eq!( + from_bytes1.to_unscaled_bytes(), + unscaled1.to_signed_bytes_be() + ); + + // Non-compact decimal + let bd2 = BigDecimal::new(BigInt::from(12345678900987654321i128), 10); + let decimal2 = Decimal::from_big_decimal(bd2.clone(), 23, 10).unwrap(); + let (unscaled2, _) = bd2.as_bigint_and_exponent(); + let from_bytes2 = + Decimal::from_unscaled_bytes(&unscaled2.to_signed_bytes_be(), 23, 10).unwrap(); + assert_eq!(from_bytes2, decimal2); + assert_eq!( + from_bytes2.to_unscaled_bytes(), + 
unscaled2.to_signed_bytes_be() + ); + } + + /// Test numeric equality and ordering (matches Java semantics) + #[test] + fn test_equality_and_ordering() { + // Same value, different precision/scale → should be equal (numeric equality) + let d1 = Decimal::from_big_decimal(BigDecimal::new(BigInt::from(10), 1), 2, 1).unwrap(); // 1.0 + let d2 = Decimal::from_big_decimal(BigDecimal::new(BigInt::from(100), 2), 3, 2).unwrap(); // 1.00 + assert_eq!(d1, d2, "Numeric equality: 1.0 == 1.00"); + assert_eq!(d1.cmp(&d2), std::cmp::Ordering::Equal); + + // Test ordering with positive values + let small = Decimal::from_unscaled_long(10, 5, 0).unwrap(); + let large = Decimal::from_unscaled_long(15, 5, 0).unwrap(); + assert!(small < large); + assert_eq!(small.cmp(&large), std::cmp::Ordering::Less); + + // Test ordering with negative values + let negative_large = Decimal::from_unscaled_long(-10, 5, 0).unwrap(); // -10 + let negative_small = Decimal::from_unscaled_long(-15, 5, 0).unwrap(); // -15 + assert!(negative_small < negative_large); // -15 < -10 + assert_eq!( + negative_small.cmp(&negative_large), + std::cmp::Ordering::Less + ); + + // Test ordering with mixed positive and negative + let positive = Decimal::from_unscaled_long(5, 5, 0).unwrap(); + let negative = Decimal::from_unscaled_long(-5, 5, 0).unwrap(); + assert!(negative < positive); + assert_eq!(negative.cmp(&positive), std::cmp::Ordering::Less); + + // Test clone and round-trip equality + let original = Decimal::from_unscaled_long(10, 5, 0).unwrap(); + assert_eq!(original.clone(), original); + assert_eq!( + Decimal::from_unscaled_long(original.to_unscaled_long().unwrap(), 5, 0).unwrap(), + original + ); + } + + /// Test hash/equals contract (Rust implementation is correct, unlike Java) + #[test] + fn test_hash_equals_contract() { + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let d1 = Decimal::from_big_decimal(BigDecimal::new(BigInt::from(10), 1), 2, 1).unwrap(); // 1.0 + let d2 = 
Decimal::from_big_decimal(BigDecimal::new(BigInt::from(100), 2), 3, 2).unwrap(); // 1.00 + + // Numeric equality + assert_eq!(d1, d2); + + // Hash contract: if a == b, then hash(a) == hash(b) + let mut hasher1 = DefaultHasher::new(); + d1.hash(&mut hasher1); + let hash1 = hasher1.finish(); + + let mut hasher2 = DefaultHasher::new(); + d2.hash(&mut hasher2); + let hash2 = hasher2.finish(); + + assert_eq!(hash1, hash2, "Equal decimals must have equal hashes"); + + // Verify HashMap works correctly (this would fail in Java due to their hash/equals bug) + let mut map = std::collections::HashMap::new(); + map.insert(d1.clone(), "value"); + assert_eq!(map.get(&d2), Some(&"value")); + } + + /// Test edge cases: zeros, large numbers, rescaling + #[test] + fn test_edge_cases() { + // Zero handling (compact and non-compact) + let zero_compact = Decimal::from_unscaled_long(0, 5, 2).unwrap(); + assert_eq!( + zero_compact.to_big_decimal(), + BigDecimal::new(BigInt::from(0), 2) + ); + + let zero_non_compact = + Decimal::from_big_decimal(BigDecimal::new(BigInt::from(0), 2), 20, 2).unwrap(); + assert_eq!( + zero_non_compact.to_big_decimal(), + BigDecimal::new(BigInt::from(0), 2) + ); + + // Large number (39 digits) + let large_bd = BigDecimal::from_str("123456789012345678901234567890123456789").unwrap(); + let large = Decimal::from_big_decimal(large_bd, 39, 0).unwrap(); + let double_val = large.to_big_decimal().to_string().parse::().unwrap(); + assert!((double_val - 1.2345678901234568E38).abs() < 0.01); + + // Rescaling: 5.0 (scale=1) → 5.00 (scale=2) + let d1 = Decimal::from_big_decimal(BigDecimal::new(BigInt::from(50), 1), 10, 1).unwrap(); + let d2 = Decimal::from_big_decimal(d1.to_big_decimal(), 10, 2).unwrap(); + assert_eq!(d2.to_big_decimal().to_string(), "5.00"); + assert_eq!(d2.scale(), 2); + } +} diff --git a/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs b/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs index ebe3da2a0a..563c1c960e 100644 
--- a/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs +++ b/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs @@ -238,86 +238,121 @@ mod tests { } #[test] - fn test_all_data_types() { + fn test_all_data_types_java_compatible() { + // Test encoding compatibility with Java using reference from: + // https://github.com/apache/fluss/blob/main/fluss-common/src/test/resources/encoding/encoded_key.hex + use crate::metadata::{DataType, TimestampLTzType, TimestampType}; + let row_type = RowType::with_data_types(vec![ - DataTypes::boolean(), - DataTypes::tinyint(), - DataTypes::smallint(), - DataTypes::int(), - DataTypes::bigint(), - DataTypes::float(), - DataTypes::double(), - // TODO Date - // TODO Time - DataTypes::binary(20), - DataTypes::bytes(), - DataTypes::char(2), - DataTypes::string(), - // TODO Decimal - // TODO Timestamp - // TODO Timestamp LTZ - // TODO Array of Int - // TODO Array of Float - // TODO Array of String - // TODO: Add Map and Row fields in Issue #1973 + DataTypes::boolean(), // BOOLEAN + DataTypes::tinyint(), // TINYINT + DataTypes::smallint(), // SMALLINT + DataTypes::int(), // INT + DataTypes::bigint(), // BIGINT + DataTypes::float(), // FLOAT + DataTypes::double(), // DOUBLE + DataTypes::date(), // DATE + DataTypes::time(), // TIME + DataTypes::binary(20), // BINARY(20) + DataTypes::bytes(), // BYTES + DataTypes::char(2), // CHAR(2) + DataTypes::string(), // STRING + DataTypes::decimal(5, 2), // DECIMAL(5,2) + DataTypes::decimal(20, 0), // DECIMAL(20,0) + DataType::Timestamp(TimestampType::with_nullable(false, 1).unwrap()), // TIMESTAMP(1) + DataType::Timestamp(TimestampType::with_nullable(false, 5).unwrap()), // TIMESTAMP(5) + DataType::TimestampLTz(TimestampLTzType::with_nullable(false, 1).unwrap()), // TIMESTAMP_LTZ(1) + DataType::TimestampLTz(TimestampLTzType::with_nullable(false, 5).unwrap()), // TIMESTAMP_LTZ(5) + // TODO: Add support for ARRAY type + // TODO: Add support for MAP type + // TODO: Add support for 
ROW type ]); + // Exact values from Java's IndexedRowTest.genRecordForAllTypes() let row = GenericRow::from_data(vec![ - Datum::from(true), - Datum::from(2i8), - Datum::from(10i16), - Datum::from(100i32), - Datum::from(-6101065172474983726i64), // from Java test case: new BigInteger("12345678901234567890").longValue() - Datum::from(13.2f32), - Datum::from(15.21f64), - // TODO Date - // TODO Time - Datum::from("1234567890".as_bytes()), - Datum::from("20".as_bytes()), - Datum::from("1"), - Datum::from("hello"), - // TODO Decimal - // TODO Timestamp - // TODO Timestamp LTZ - // TODO Array of Int - // TODO Array of Float - // TODO Array of String - // TODO: Add Map and Row fields in Issue #1973 + Datum::from(true), // BOOLEAN: true + Datum::from(2i8), // TINYINT: 2 + Datum::from(10i16), // SMALLINT: 10 + Datum::from(100i32), // INT: 100 + Datum::from(-6101065172474983726i64), // BIGINT + Datum::from(13.2f32), // FLOAT: 13.2 + Datum::from(15.21f64), // DOUBLE: 15.21 + Datum::Date(crate::row::datum::Date::new(19655)), // DATE: 2023-10-25 (19655 days since epoch) + Datum::Time(crate::row::datum::Time::new(34200000)), // TIME: 09:30:00.0 + Datum::from("1234567890".as_bytes()), // BINARY(20) + Datum::from("20".as_bytes()), // BYTES + Datum::from("1"), // CHAR(2): "1" + Datum::from("hello"), // STRING: "hello" + Datum::Decimal(crate::row::Decimal::from_unscaled_long(9, 5, 2).unwrap()), // DECIMAL(5,2) + Datum::Decimal( + crate::row::Decimal::from_big_decimal( + bigdecimal::BigDecimal::new(bigdecimal::num_bigint::BigInt::from(10), 0), + 20, + 0, + ) + .unwrap(), + ), // DECIMAL(20,0) + Datum::TimestampNtz(crate::row::datum::TimestampNtz::new(1698235273182)), // TIMESTAMP(1) + Datum::TimestampNtz(crate::row::datum::TimestampNtz::new(1698235273182)), // TIMESTAMP(5) + Datum::TimestampLtz(crate::row::datum::TimestampLtz::new(1698235273182)), // TIMESTAMP_LTZ(1) + Datum::TimestampLtz(crate::row::datum::TimestampLtz::new(1698235273182)), // TIMESTAMP_LTZ(5) ]); - let mut encoder = 
for_test_row_type(&row_type); - - let mut expected: Vec = Vec::new(); - // BOOLEAN: true - expected.extend(vec![0x01]); - // TINYINT: 2 - expected.extend(vec![0x02]); - // SMALLINT: 10 - expected.extend(vec![0x0A]); - // INT: 100 - expected.extend(vec![0x00, 0x64]); - // BIGINT: -6101065172474983726 - expected.extend(vec![ + // Expected bytes from Java's encoded_key.hex reference file + #[rustfmt::skip] + let expected: Vec = vec![ + // BOOLEAN: true + 0x01, + // TINYINT: 2 + 0x02, + // SMALLINT: 10 (varint encoded) + 0x0A, + // INT: 100 (varint encoded) + 0x00, 0x64, + // BIGINT: -6101065172474983726 0xD2, 0x95, 0xFC, 0xD8, 0xCE, 0xB1, 0xAA, 0xAA, 0xAB, 0x01, - ]); - // FLOAT: 13.2 - expected.extend(vec![0x33, 0x33, 0x53, 0x41]); - // DOUBLE: 15.21 - expected.extend(vec![0xEC, 0x51, 0xB8, 0x1E, 0x85, 0x6B, 0x2E, 0x40]); - // BINARY(20): "1234567890".getBytes() - expected.extend(vec![ + // FLOAT: 13.2 + 0x33, 0x33, 0x53, 0x41, + // DOUBLE: 15.21 + 0xEC, 0x51, 0xB8, 0x1E, 0x85, 0x6B, 0x2E, 0x40, + // DATE: 2023-10-25 + 0xC7, 0x99, 0x01, + // TIME: 09:30:00.0 + 0xC0, 0xB3, 0xA7, 0x10, + // BINARY(20): "1234567890" 0x0A, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, - ]); + // BYTES: "20" + 0x02, 0x32, 0x30, + // CHAR(2): "1" + 0x01, 0x31, + // STRING: "hello" + 0x05, 0x68, 0x65, 0x6C, 0x6C, 0x6F, + // DECIMAL(5,2): 9 + 0x09, + // DECIMAL(20,0): 10 + 0x01, 0x0A, + // TIMESTAMP(1): 1698235273182 + 0xDE, 0x9F, 0xD7, 0xB5, 0xB6, 0x31, + // TIMESTAMP(5): 1698235273182 + 0xDE, 0x9F, 0xD7, 0xB5, 0xB6, 0x31, 0x00, + // TIMESTAMP_LTZ(1): 1698235273182 + 0xDE, 0x9F, 0xD7, 0xB5, 0xB6, 0x31, + // TIMESTAMP_LTZ(5): 1698235273182 + 0xDE, 0x9F, 0xD7, 0xB5, 0xB6, 0x31, 0x00, + ]; - // BYTES: "20".getBytes() - expected.extend(vec![0x02, 0x32, 0x30]); - // CHAR(2): "1" - expected.extend(vec![0x01, 0x31]); - // STRING: String: "hello" - expected.extend(vec![0x05, 0x68, 0x65, 0x6C, 0x6C, 0x6F]); + let mut encoder = for_test_row_type(&row_type); + let encoded = 
encoder.encode_key(&row).unwrap(); + + // Assert byte-for-byte compatibility with Java's encoded_key.hex assert_eq!( - encoder.encode_key(&row).unwrap().iter().as_slice(), - expected.as_slice() + encoded.iter().as_slice(), + expected.as_slice(), + "\n\nRust encoding does not match Java reference from encoded_key.hex\n\ + Expected: {:02X?}\n\ + Actual: {:02X?}\n", + expected, + encoded.iter().as_slice() ); } } diff --git a/fluss-rust/crates/fluss/src/row/field_getter.rs b/fluss-rust/crates/fluss/src/row/field_getter.rs index 97f9e395fc..cbffa4d09f 100644 --- a/fluss-rust/crates/fluss/src/row/field_getter.rs +++ b/fluss-rust/crates/fluss/src/row/field_getter.rs @@ -66,6 +66,21 @@ impl FieldGetter { DataType::BigInt(_) => InnerFieldGetter::BigInt { pos }, DataType::Float(_) => InnerFieldGetter::Float { pos }, DataType::Double(_) => InnerFieldGetter::Double { pos }, + DataType::Decimal(decimal_type) => InnerFieldGetter::Decimal { + pos, + precision: decimal_type.precision() as usize, + scale: decimal_type.scale() as usize, + }, + DataType::Date(_) => InnerFieldGetter::Date { pos }, + DataType::Time(_) => InnerFieldGetter::Time { pos }, + DataType::Timestamp(t) => InnerFieldGetter::Timestamp { + pos, + precision: t.precision(), + }, + DataType::TimestampLTz(t) => InnerFieldGetter::TimestampLtz { + pos, + precision: t.precision(), + }, _ => unimplemented!("DataType {:?} is currently unimplemented", data_type), }; @@ -79,17 +94,60 @@ impl FieldGetter { #[derive(Clone)] pub enum InnerFieldGetter { - Char { pos: usize, len: usize }, - String { pos: usize }, - Bool { pos: usize }, - Binary { pos: usize, len: usize }, - Bytes { pos: usize }, - TinyInt { pos: usize }, - SmallInt { pos: usize }, - Int { pos: usize }, - BigInt { pos: usize }, - Float { pos: usize }, - Double { pos: usize }, + Char { + pos: usize, + len: usize, + }, + String { + pos: usize, + }, + Bool { + pos: usize, + }, + Binary { + pos: usize, + len: usize, + }, + Bytes { + pos: usize, + }, + TinyInt { + pos: 
usize, + }, + SmallInt { + pos: usize, + }, + Int { + pos: usize, + }, + BigInt { + pos: usize, + }, + Float { + pos: usize, + }, + Double { + pos: usize, + }, + Decimal { + pos: usize, + precision: usize, + scale: usize, + }, + Date { + pos: usize, + }, + Time { + pos: usize, + }, + Timestamp { + pos: usize, + precision: u32, + }, + TimestampLtz { + pos: usize, + precision: u32, + }, } impl InnerFieldGetter { @@ -106,7 +164,19 @@ impl InnerFieldGetter { InnerFieldGetter::BigInt { pos } => Datum::from(row.get_long(*pos)), InnerFieldGetter::Float { pos } => Datum::from(row.get_float(*pos)), InnerFieldGetter::Double { pos } => Datum::from(row.get_double(*pos)), - //TODO Decimal, Date, Time, Timestamp, TimestampLTZ, Array, Map, Row + InnerFieldGetter::Decimal { + pos, + precision, + scale, + } => Datum::Decimal(row.get_decimal(*pos, *precision, *scale)), + InnerFieldGetter::Date { pos } => Datum::Date(row.get_date(*pos)), + InnerFieldGetter::Time { pos } => Datum::Time(row.get_time(*pos)), + InnerFieldGetter::Timestamp { pos, precision } => { + Datum::TimestampNtz(row.get_timestamp_ntz(*pos, *precision)) + } + InnerFieldGetter::TimestampLtz { pos, precision } => { + Datum::TimestampLtz(row.get_timestamp_ltz(*pos, *precision)) + } //TODO Array, Map, Row } } @@ -122,7 +192,12 @@ impl InnerFieldGetter { | Self::Int { pos } | Self::BigInt { pos } | Self::Float { pos, .. } - | Self::Double { pos } => *pos, + | Self::Double { pos } + | Self::Decimal { pos, .. } + | Self::Date { pos } + | Self::Time { pos } + | Self::Timestamp { pos, .. } + | Self::TimestampLtz { pos, .. 
} => *pos, } } } diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index 536409efde..d2f640e4dc 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -18,6 +18,7 @@ mod column; mod datum; +mod decimal; pub mod binary; pub mod compacted; @@ -28,6 +29,7 @@ mod row_decoder; pub use column::*; pub use compacted::CompactedRow; pub use datum::*; +pub use decimal::{Decimal, MAX_COMPACT_PRECISION}; pub use encode::KeyEncoder; pub use row_decoder::{CompactedRowDecoder, RowDecoder, RowDecoderFactory}; @@ -71,14 +73,26 @@ pub trait InternalRow { /// Returns the string value at the given position fn get_string(&self, pos: usize) -> &str; - // /// Returns the decimal value at the given position - // fn get_decimal(&self, pos: usize, precision: usize, scale: usize) -> Decimal; + /// Returns the decimal value at the given position + fn get_decimal(&self, pos: usize, precision: usize, scale: usize) -> Decimal; - // /// Returns the timestamp value at the given position - // fn get_timestamp_ntz(&self, pos: usize, precision: usize) -> TimestampNtz; + /// Returns the date value at the given position (date as days since epoch) + fn get_date(&self, pos: usize) -> datum::Date; - // /// Returns the timestamp value at the given position - // fn get_timestamp_ltz(&self, pos: usize, precision: usize) -> TimestampLtz; + /// Returns the time value at the given position (time as milliseconds since midnight) + fn get_time(&self, pos: usize) -> datum::Time; + + /// Returns the timestamp value at the given position (timestamp without timezone) + /// + /// The precision is required to determine whether the timestamp value was stored + /// in a compact representation (precision <= 3) or with nanosecond precision. 
+ fn get_timestamp_ntz(&self, pos: usize, precision: u32) -> datum::TimestampNtz; + + /// Returns the timestamp value at the given position (timestamp with local timezone) + /// + /// The precision is required to determine whether the timestamp value was stored + /// in a compact representation (precision <= 3) or with nanosecond precision. + fn get_timestamp_ltz(&self, pos: usize, precision: u32) -> datum::TimestampLtz; /// Returns the binary value at the given position with fixed length fn get_binary(&self, pos: usize, length: usize) -> &[u8]; @@ -123,6 +137,43 @@ impl<'a> InternalRow for GenericRow<'a> { self.values.get(_pos).unwrap().try_into().unwrap() } + fn get_decimal(&self, pos: usize, _precision: usize, _scale: usize) -> Decimal { + match self.values.get(pos).unwrap() { + Datum::Decimal(d) => d.clone(), + other => panic!("Expected Decimal at pos {pos:?}, got {other:?}"), + } + } + + fn get_date(&self, pos: usize) -> datum::Date { + match self.values.get(pos).unwrap() { + Datum::Date(d) => *d, + Datum::Int32(i) => datum::Date::new(*i), + other => panic!("Expected Date or Int32 at pos {pos:?}, got {other:?}"), + } + } + + fn get_time(&self, pos: usize) -> datum::Time { + match self.values.get(pos).unwrap() { + Datum::Time(t) => *t, + Datum::Int32(i) => datum::Time::new(*i), + other => panic!("Expected Time or Int32 at pos {pos:?}, got {other:?}"), + } + } + + fn get_timestamp_ntz(&self, pos: usize, _precision: u32) -> datum::TimestampNtz { + match self.values.get(pos).unwrap() { + Datum::TimestampNtz(t) => *t, + other => panic!("Expected TimestampNtz at pos {pos:?}, got {other:?}"), + } + } + + fn get_timestamp_ltz(&self, pos: usize, _precision: u32) -> datum::TimestampLtz { + match self.values.get(pos).unwrap() { + Datum::TimestampLtz(t) => *t, + other => panic!("Expected TimestampLtz at pos {pos:?}, got {other:?}"), + } + } + fn get_float(&self, pos: usize) -> f32 { self.values.get(pos).unwrap().try_into().unwrap() } From 
0e31ef49c0939deb6890bb7c045816001b7eef20 Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Tue, 20 Jan 2026 13:09:49 +0000 Subject: [PATCH 076/287] feat: Introduce UpsertWriter (#169) --- fluss-rust/crates/fluss/Cargo.toml | 1 + .../src/client/table/log_fetch_buffer.rs | 6 +- .../crates/fluss/src/client/table/lookup.rs | 9 +- .../crates/fluss/src/client/table/mod.rs | 18 + .../src/client/table/partition_getter.rs | 56 ++ .../crates/fluss/src/client/table/upsert.rs | 522 ++++++++++++++++++ .../crates/fluss/src/client/table/writer.rs | 20 +- .../fluss/src/client/write/accumulator.rs | 39 +- .../crates/fluss/src/client/write/batch.rs | 20 +- .../fluss/src/client/write/bucket_assigner.rs | 9 +- .../crates/fluss/src/client/write/mod.rs | 54 +- .../fluss/src/client/write/write_format.rs | 1 + .../fluss/src/client/write/writer_client.rs | 9 +- fluss-rust/crates/fluss/src/metadata/table.rs | 66 ++- fluss-rust/crates/fluss/src/record/arrow.rs | 62 +-- .../fluss/src/record/kv/kv_record_batch.rs | 11 +- .../src/record/kv/kv_record_batch_builder.rs | 80 +-- .../src/record/kv/kv_record_read_context.rs | 17 +- .../fluss/src/row/compacted/compacted_row.rs | 15 +- .../src/row/compacted/compacted_row_writer.rs | 12 + .../src/row/encode/compacted_row_encoder.rs | 13 +- fluss-rust/crates/fluss/src/row/encode/mod.rs | 23 +- fluss-rust/crates/fluss/src/row/mod.rs | 27 +- 23 files changed, 895 insertions(+), 195 deletions(-) create mode 100644 fluss-rust/crates/fluss/src/client/table/partition_getter.rs create mode 100644 fluss-rust/crates/fluss/src/client/table/upsert.rs diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml index c3bdd4475a..9aeee72db4 100644 --- a/fluss-rust/crates/fluss/Cargo.toml +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -33,6 +33,7 @@ integration_tests = [] [dependencies] arrow = { workspace = true } arrow-schema = "57.0.0" +bitvec = "1" byteorder = "1.5" futures = "0.3" clap = { workspace = true } diff --git 
a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs index ac44cc1728..ca0a2532c8 100644 --- a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs +++ b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs @@ -651,14 +651,14 @@ mod tests { use crate::compression::{ ArrowCompressionInfo, ArrowCompressionType, DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, }; - use crate::metadata::{DataField, DataTypes, TablePath}; + use crate::metadata::{DataField, DataTypes, RowType, TablePath}; use crate::record::{MemoryLogRecordsArrowBuilder, ReadContext, to_arrow_schema}; use crate::row::GenericRow; use std::sync::Arc; use std::time::Duration; fn test_read_context() -> ReadContext { - let row_type = DataTypes::row(vec![DataField::new( + let row_type = RowType::new(vec![DataField::new( "id".to_string(), DataTypes::int(), None, @@ -714,7 +714,7 @@ mod tests { #[test] fn default_completed_fetch_reads_records() -> Result<()> { - let row_type = DataTypes::row(vec![ + let row_type = RowType::new(vec![ DataField::new("id".to_string(), DataTypes::int(), None), DataField::new("name".to_string(), DataTypes::string(), None), ]); diff --git a/fluss-rust/crates/fluss/src/client/table/lookup.rs b/fluss-rust/crates/fluss/src/client/table/lookup.rs index 1d32ebd75e..cd23503a5b 100644 --- a/fluss-rust/crates/fluss/src/client/table/lookup.rs +++ b/fluss-rust/crates/fluss/src/client/table/lookup.rs @@ -22,7 +22,7 @@ use crate::error::{Error, Result}; use crate::metadata::{RowType, TableBucket, TableInfo}; use crate::row::InternalRow; use crate::row::compacted::CompactedRow; -use crate::row::encode::KeyEncoder; +use crate::row::encode::{KeyEncoder, KeyEncoderFactory}; use crate::rpc::ApiError; use crate::rpc::message::LookupRequest; use std::sync::Arc; @@ -130,8 +130,11 @@ impl<'a> TableLookup<'a> { // Create key encoder for the primary key fields let pk_fields = self.table_info.get_physical_primary_keys().to_vec(); - 
let key_encoder = - ::of(self.table_info.row_type(), pk_fields, data_lake_format)?; + let key_encoder = KeyEncoderFactory::of( + self.table_info.row_type(), + pk_fields.as_slice(), + &data_lake_format, + )?; Ok(Lookuper { conn: self.conn, diff --git a/fluss-rust/crates/fluss/src/client/table/mod.rs b/fluss-rust/crates/fluss/src/client/table/mod.rs index 7356be2393..2bfa054105 100644 --- a/fluss-rust/crates/fluss/src/client/table/mod.rs +++ b/fluss-rust/crates/fluss/src/client/table/mod.rs @@ -27,13 +27,17 @@ mod append; mod lookup; mod log_fetch_buffer; +mod partition_getter; mod remote_log; mod scanner; +mod upsert; mod writer; +use crate::client::table::upsert::TableUpsert; pub use append::{AppendWriter, TableAppend}; pub use lookup::{LookupResult, Lookuper, TableLookup}; pub use scanner::{LogScanner, RecordBatchLogScanner, TableScan}; +pub use writer::{TableWriter, UpsertWriter}; #[allow(dead_code)] pub struct FlussTable<'a> { @@ -119,6 +123,20 @@ impl<'a> FlussTable<'a> { self.metadata.clone(), )) } + + pub fn new_upsert(&self) -> Result { + if !self.has_primary_key { + return Err(Error::UnsupportedOperation { + message: "Upsert is only supported for primary key tables".to_string(), + }); + } + + Ok(TableUpsert::new( + self.table_path.clone(), + self.table_info.clone(), + self.conn.get_or_create_writer_client()?, + )) + } } impl<'a> Drop for FlussTable<'a> { diff --git a/fluss-rust/crates/fluss/src/client/table/partition_getter.rs b/fluss-rust/crates/fluss/src/client/table/partition_getter.rs new file mode 100644 index 0000000000..4529d8684f --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/partition_getter.rs @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::{DataType, RowType}; +use crate::row::field_getter::FieldGetter; + +#[allow(dead_code)] +pub struct PartitionGetter<'a> { + partitions: Vec<(&'a String, &'a DataType, FieldGetter)>, +} + +#[allow(dead_code)] +impl<'a> PartitionGetter<'a> { + pub fn new(row_type: &'a RowType, partition_keys: &'a Vec) -> Result { + let mut partitions = Vec::with_capacity(partition_keys.len()); + + for partition_key in partition_keys { + if let Some(partition_col_index) = row_type.get_field_index(partition_key.as_str()) { + let data_type = &row_type + .fields() + .get(partition_col_index) + .unwrap() + .data_type; + let field_getter = FieldGetter::create(data_type, partition_col_index); + + partitions.push((partition_key, data_type, field_getter)); + } else { + return Err(IllegalArgument { + message: format!( + "The partition column {partition_key} is not in the row {row_type}." 
+ ), + }); + }; + } + + Ok(Self { partitions }) + } + + // TODO Implement get partition +} diff --git a/fluss-rust/crates/fluss/src/client/table/upsert.rs b/fluss-rust/crates/fluss/src/client/table/upsert.rs new file mode 100644 index 0000000000..a3909e7258 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/upsert.rs @@ -0,0 +1,522 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::table::writer::{DeleteResult, TableWriter, UpsertResult, UpsertWriter}; +use crate::client::{RowBytes, WriteFormat, WriteRecord, WriterClient}; +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::{KvFormat, RowType, TableInfo, TablePath}; +use crate::row::InternalRow; +use crate::row::encode::{KeyEncoder, KeyEncoderFactory, RowEncoder, RowEncoderFactory}; +use crate::row::field_getter::FieldGetter; +use std::sync::Arc; + +use bitvec::prelude::bitvec; +use bytes::Bytes; + +#[allow(dead_code)] +pub struct TableUpsert { + table_path: TablePath, + table_info: TableInfo, + writer_client: Arc, + target_columns: Option>>, +} + +#[allow(dead_code)] +impl TableUpsert { + pub fn new( + table_path: TablePath, + table_info: TableInfo, + writer_client: Arc, + ) -> Self { + Self { + table_path, + table_info, + writer_client, + target_columns: None, + } + } + + pub fn partial_update(&self, target_columns: Option>) -> Result { + if let Some(columns) = &target_columns { + let num_columns = self.table_info.row_type().fields().len(); + + if let Some(&invalid_column) = columns.iter().find(|&&col| col >= num_columns) { + return Err(IllegalArgument { + message: format!( + "Invalid target column index: {invalid_column} for table {}. 
The table only has {num_columns} columns.", + self.table_path + ), + }); + } + } + + Ok(Self { + table_path: self.table_path.clone(), + table_info: self.table_info.clone(), + writer_client: self.writer_client.clone(), + target_columns: target_columns.map(Arc::new), + }) + } + + pub fn partial_update_with_column_names(&self, target_column_names: &[&str]) -> Result { + let row_type = self.table_info.row_type(); + let col_indices: Vec<(&str, Option)> = target_column_names + .iter() + .map(|col_name| (*col_name, row_type.get_field_index(col_name))) + .collect(); + + if let Some((missing_name, _)) = col_indices.iter().find(|(_, ix)| ix.is_none()) { + return Err(IllegalArgument { + message: format!( + "Cannot find target column `{}` for table {}.", + missing_name, self.table_path + ), + }); + } + + let valid_col_indices: Vec = col_indices + .into_iter() + .map(|(_, index)| index.unwrap()) + .collect(); + + self.partial_update(Some(valid_col_indices)) + } + + pub fn create_writer(&self) -> Result { + UpsertWriterFactory::create( + Arc::new(self.table_path.clone()), + Arc::new(self.table_info.clone()), + self.target_columns.clone(), + Arc::clone(&self.writer_client), + ) + } +} + +#[allow(dead_code)] +struct UpsertWriterImpl +where + RE: RowEncoder, +{ + table_path: Arc, + writer_client: Arc, + // TODO: Partitioning + // partition_field_getter: Option>, + primary_key_encoder: Box, + target_columns: Option>>, + // Use primary key encoder as bucket key encoder when None + bucket_key_encoder: Option>, + kv_format: KvFormat, + write_format: WriteFormat, + row_encoder: RE, + field_getters: Box<[FieldGetter]>, + table_info: Arc, +} + +#[allow(dead_code)] +struct UpsertWriterFactory; + +#[allow(dead_code)] +impl UpsertWriterFactory { + pub fn create( + table_path: Arc, + table_info: Arc, + partial_update_columns: Option>>, + writer_client: Arc, + ) -> Result { + let data_lake_format = &table_info.table_config.get_datalake_format()?; + let row_type = table_info.row_type(); + let 
physical_pks = table_info.get_physical_primary_keys(); + + let names = table_info.get_schema().auto_increment_col_names(); + + Self::sanity_check( + row_type, + &table_info.primary_keys, + names, + &partial_update_columns, + )?; + + let primary_key_encoder = KeyEncoderFactory::of(row_type, physical_pks, data_lake_format)?; + let bucket_key_encoder = if !table_info.is_default_bucket_key() { + Some(KeyEncoderFactory::of( + row_type, + table_info.get_bucket_keys(), + data_lake_format, + )?) + } else { + // Defaults to using primary key encoder when None for bucket key + None + }; + + let kv_format = table_info.get_table_config().get_kv_format()?; + let write_format = WriteFormat::from_kv_format(&kv_format)?; + + let field_getters = FieldGetter::create_field_getters(row_type); + + Ok(UpsertWriterImpl { + table_path, + writer_client, + primary_key_encoder, + target_columns: partial_update_columns, + bucket_key_encoder, + kv_format: kv_format.clone(), + write_format, + row_encoder: RowEncoderFactory::create(kv_format, row_type.clone())?, + field_getters, + table_info: table_info.clone(), + }) + } + + #[allow(dead_code)] + fn sanity_check( + row_type: &RowType, + primary_keys: &Vec, + auto_increment_col_names: &Vec, + target_columns: &Option>>, + ) -> Result<()> { + if target_columns.is_none() { + if !auto_increment_col_names.is_empty() { + return Err(IllegalArgument { + message: format!( + "This table has auto increment column {}. Explicitly specifying values for an auto increment column is not allowed. 
Please Specify non-auto-increment columns as target columns using partialUpdate first.", + auto_increment_col_names.join(", ") + ), + }); + } + return Ok(()); + } + + let field_count = row_type.fields().len(); + + let mut target_column_set = bitvec![0; field_count]; + + let columns = target_columns.as_ref().unwrap().as_ref(); + + for &target_index in columns { + target_column_set.set(target_index, true); + } + + let mut pk_column_set = bitvec![0; field_count]; + + // check the target columns contains the primary key + for primary_key in primary_keys { + let pk_index = row_type.get_field_index(primary_key.as_str()); + match pk_index { + Some(pk_index) => { + if !target_column_set[pk_index] { + return Err(IllegalArgument { + message: format!( + "The target write columns {} must contain the primary key columns {}", + row_type.project(columns)?.get_field_names().join(", "), + primary_keys.join(", ") + ), + }); + } + pk_column_set.set(pk_index, true); + } + None => { + return Err(IllegalArgument { + message: format!( + "The specified primary key {} is not in row type {}", + primary_key, row_type + ), + }); + } + } + } + + let mut auto_increment_column_set = bitvec![0; field_count]; + // explicitly specifying values for an auto increment column is not allowed + for auto_increment_col_name in auto_increment_col_names { + let auto_increment_field_index = + row_type.get_field_index(auto_increment_col_name.as_str()); + + if let Some(index) = auto_increment_field_index { + if target_column_set[index] { + return Err(IllegalArgument { + message: format!( + "Explicitly specifying values for the auto increment column {} is not allowed.", + auto_increment_col_name + ), + }); + } + + auto_increment_column_set.set(index, true); + } + } + + // check the columns not in targetColumns should be nullable + for i in 0..field_count { + // column not in primary key and not in auto increment column + if !pk_column_set[i] && !auto_increment_column_set[i] { + // the column should be nullable + 
if !row_type.fields().get(i).unwrap().data_type.is_nullable() { + return Err(IllegalArgument { + message: format!( + "Partial Update requires all columns except primary key to be nullable, but column {} is NOT NULL.", + row_type.fields().get(i).unwrap().name() + ), + }); + } + } + } + + Ok(()) + } +} + +#[allow(dead_code)] +impl UpsertWriterImpl { + fn check_field_count(&self, row: &R) -> Result<()> { + let expected = self.table_info.get_row_type().fields().len(); + if row.get_field_count() != expected { + return Err(IllegalArgument { + message: format!( + "The field count of the row does not match the table schema. Expected: {}, Actual: {}", + expected, + row.get_field_count() + ), + }); + } + Ok(()) + } + + fn get_keys(&mut self, row: &dyn InternalRow) -> Result<(Bytes, Option)> { + let key = self.primary_key_encoder.encode_key(row)?; + let bucket_key = match &mut self.bucket_key_encoder { + Some(bucket_key_encoder) => Some(bucket_key_encoder.encode_key(row)?), + None => Some(key.clone()), + }; + Ok((key, bucket_key)) + } + + fn encode_row(&mut self, row: &R) -> Result { + self.row_encoder.start_new_row()?; + for (pos, field_getter) in self.field_getters.iter().enumerate() { + let datum = field_getter.get_field(row); + self.row_encoder.encode_field(pos, datum)?; + } + self.row_encoder.finish_row() + } +} + +impl TableWriter for UpsertWriterImpl { + /// Flush data written that have not yet been sent to the server, forcing the client to send the + /// requests to server and blocks on the completion of the requests associated with these + /// records. A request is considered completed when it is successfully acknowledged according to + /// the CLIENT_WRITER_ACKS configuration option you have specified or else it + /// results in an error. + async fn flush(&self) -> Result<()> { + self.writer_client.flush().await + } +} + +impl UpsertWriter for UpsertWriterImpl { + /// Inserts row into Fluss table if they do not already exist, or updates them if they do exist. 
+ /// + /// # Arguments + /// * row - the row to upsert. + /// + /// # Returns + /// Ok(UpsertResult) when completed normally + async fn upsert(&mut self, row: &R) -> Result { + self.check_field_count(row)?; + + let (key, bucket_key) = self.get_keys(row)?; + + let row_bytes: RowBytes<'_> = match row.as_encoded_bytes(self.write_format) { + Some(bytes) => RowBytes::Borrowed(bytes), + None => RowBytes::Owned(self.encode_row(row)?), + }; + + let write_record = WriteRecord::for_upsert( + Arc::clone(&self.table_path), + self.table_info.schema_id, + key, + bucket_key, + self.write_format, + self.target_columns.clone(), + Some(row_bytes), + ); + + let result_handle = self.writer_client.send(&write_record).await?; + let result = result_handle.wait().await?; + + result_handle.result(result).map(|_| UpsertResult) + } + + /// Delete certain row by the input row in Fluss table, the input row must contain the primary + /// key. + /// + /// # Arguments + /// * row - the row to delete. + /// + /// # Returns + /// Ok(DeleteResult) when completed normally + async fn delete(&mut self, row: &R) -> Result { + self.check_field_count(row)?; + + let (key, bucket_key) = self.get_keys(row)?; + + let write_record = WriteRecord::for_upsert( + Arc::clone(&self.table_path), + self.table_info.schema_id, + key, + bucket_key, + self.write_format, + self.target_columns.clone(), + None, + ); + + let result_handle = self.writer_client.send(&write_record).await?; + let result = result_handle.wait().await?; + + result_handle.result(result).map(|_| DeleteResult) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{DataField, DataTypes}; + + #[test] + fn sanity_check() { + // No target columns specified but table has auto-increment column + let fields = vec![ + DataField::new("id".to_string(), DataTypes::int().as_non_nullable(), None), + DataField::new("name".to_string(), DataTypes::string(), None), + ]; + let row_type = RowType::new(fields); + let primary_keys = 
vec!["id".to_string()]; + let auto_increment_col_names = vec!["id".to_string()]; + let target_columns = None; + + let result = UpsertWriterFactory::sanity_check( + &row_type, + &primary_keys, + &auto_increment_col_names, + &target_columns, + ); + + assert!(result.unwrap_err().to_string().contains( + "This table has auto increment column id. Explicitly specifying values for an auto increment column is not allowed. Please Specify non-auto-increment columns as target columns using partialUpdate first." + )); + + // Target columns do not contain primary key + let fields = vec![ + DataField::new("id".to_string(), DataTypes::int().as_non_nullable(), None), + DataField::new("name".to_string(), DataTypes::string(), None), + DataField::new("value".to_string(), DataTypes::int(), None), + ]; + let row_type = RowType::new(fields); + let primary_keys = vec!["id".to_string()]; + let auto_increment_col_names = vec![]; + let target_columns = Some(Arc::new(vec![1usize])); + + let result = UpsertWriterFactory::sanity_check( + &row_type, + &primary_keys, + &auto_increment_col_names, + &target_columns, + ); + + assert!( + result + .unwrap_err() + .to_string() + .contains("The target write columns name must contain the primary key columns id") + ); + + // Primary key column not found in row type + let fields = vec![ + DataField::new("id".to_string(), DataTypes::int().as_non_nullable(), None), + DataField::new("name".to_string(), DataTypes::string(), None), + ]; + let row_type = RowType::new(fields); + let primary_keys = vec!["nonexistent_pk".to_string()]; + let auto_increment_col_names = vec![]; + let target_columns = Some(Arc::new(vec![0usize, 1])); + + let result = UpsertWriterFactory::sanity_check( + &row_type, + &primary_keys, + &auto_increment_col_names, + &target_columns, + ); + + assert!( + result + .unwrap_err() + .to_string() + .contains("The specified primary key nonexistent_pk is not in row type") + ); + + // Target columns include auto-increment column + let fields = vec![ 
+ DataField::new("id".to_string(), DataTypes::int().as_non_nullable(), None), + DataField::new( + "seq".to_string(), + DataTypes::bigint().as_non_nullable(), + None, + ), + DataField::new("name".to_string(), DataTypes::string(), None), + ]; + let row_type = RowType::new(fields); + let primary_keys = vec!["id".to_string()]; + let auto_increment_col_names = vec!["seq".to_string()]; + let target_columns = Some(Arc::new(vec![0usize, 1, 2])); + + let result = UpsertWriterFactory::sanity_check( + &row_type, + &primary_keys, + &auto_increment_col_names, + &target_columns, + ); + + assert!(result.unwrap_err().to_string().contains( + "Explicitly specifying values for the auto increment column seq is not allowed." + )); + + // Non-nullable column not in target columns (partial update requires nullable) + let fields = vec![ + DataField::new("id".to_string(), DataTypes::int().as_non_nullable(), None), + DataField::new( + "required_field".to_string(), + DataTypes::string().as_non_nullable(), + None, + ), + DataField::new("optional_field".to_string(), DataTypes::int(), None), + ]; + let row_type = RowType::new(fields); + let primary_keys = vec!["id".to_string()]; + let auto_increment_col_names = vec![]; + let target_columns = Some(Arc::new(vec![0usize])); + + let result = UpsertWriterFactory::sanity_check( + &row_type, + &primary_keys, + &auto_increment_col_names, + &target_columns, + ); + + assert!(result.unwrap_err().to_string().contains( + "Partial Update requires all columns except primary key to be nullable, but column required_field is NOT NULL." + )); + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/writer.rs b/fluss-rust/crates/fluss/src/client/table/writer.rs index 8a83b5e356..8276545703 100644 --- a/fluss-rust/crates/fluss/src/client/table/writer.rs +++ b/fluss-rust/crates/fluss/src/client/table/writer.rs @@ -16,13 +16,13 @@ // under the License. 
use crate::client::{WriteRecord, WriterClient}; -use crate::row::GenericRow; +use crate::row::{GenericRow, InternalRow}; use std::sync::Arc; use crate::error::Result; use crate::metadata::{TableInfo, TablePath}; -#[allow(dead_code)] +#[allow(dead_code, async_fn_in_trait)] pub trait TableWriter { async fn flush(&self) -> Result<()>; } @@ -32,12 +32,22 @@ pub trait AppendWriter: TableWriter { async fn append(&self, row: GenericRow) -> Result<()>; } -#[allow(dead_code)] +#[allow(dead_code, async_fn_in_trait)] pub trait UpsertWriter: TableWriter { - async fn upsert(&self, row: GenericRow) -> Result<()>; - async fn delete(&self, row: GenericRow) -> Result<()>; + async fn upsert(&mut self, row: &R) -> Result; + async fn delete(&mut self, row: &R) -> Result; } +/// The result of upserting a record +/// Currently this is an empty struct to allow for compatible evolution in the future +#[derive(Default)] +pub struct UpsertResult; + +/// The result of deleting a record +/// Currently this is an empty struct to allow for compatible evolution in the future +#[derive(Default)] +pub struct DeleteResult; + #[allow(dead_code)] pub struct AbstractTableWriter { table_path: Arc, diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs index 0afc9d4bc1..fb7b54470d 100644 --- a/fluss-rust/crates/fluss/src/client/write/accumulator.rs +++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-use crate::client::write::batch::WriteBatch::ArrowLog; -use crate::client::write::batch::{ArrowLogWriteBatch, WriteBatch}; +use crate::client::write::batch::WriteBatch::{ArrowLog, Kv}; +use crate::client::write::batch::{ArrowLogWriteBatch, KvWriteBatch, WriteBatch}; use crate::client::{LogWriteRecord, Record, ResultHandle, WriteRecord}; use crate::cluster::{BucketLocation, Cluster, ServerNode}; use crate::config::Config; @@ -102,16 +102,29 @@ impl RecordAccumulator { let schema_id = table_info.schema_id; - let mut batch = ArrowLog(ArrowLogWriteBatch::new( - self.batch_id.fetch_add(1, Ordering::Relaxed), - table_path.as_ref().clone(), - schema_id, - arrow_compression_info, - row_type, - bucket_id, - current_time_ms(), - matches!(&record.record, Record::Log(LogWriteRecord::RecordBatch(_))), - )); + let mut batch: WriteBatch = match record.record() { + Record::Log(_) => ArrowLog(ArrowLogWriteBatch::new( + self.batch_id.fetch_add(1, Ordering::Relaxed), + table_path.as_ref().clone(), + schema_id, + arrow_compression_info, + row_type, + bucket_id, + current_time_ms(), + matches!(&record.record, Record::Log(LogWriteRecord::RecordBatch(_))), + )), + Record::Kv(kv_record) => Kv(KvWriteBatch::new( + self.batch_id.fetch_add(1, Ordering::Relaxed), + table_path.as_ref().clone(), + schema_id, + // TODO: Decide how to derive write limit in the absence of java's equivalent of PreAllocatedPagedOutputView + KvWriteBatch::DEFAULT_WRITE_LIMIT, + record.write_format.to_kv_format()?, + bucket_id, + kv_record.target_columns.clone(), + current_time_ms(), + )), + }; let batch_id = batch.batch_id(); @@ -142,6 +155,8 @@ impl RecordAccumulator { ) -> Result { let table_path = &record.table_path; + // TODO: Implement partitioning + let dq = { let mut binding = self .write_batches diff --git a/fluss-rust/crates/fluss/src/client/write/batch.rs b/fluss-rust/crates/fluss/src/client/write/batch.rs index 01597538c8..2ddf5192ed 100644 --- a/fluss-rust/crates/fluss/src/client/write/batch.rs +++ 
b/fluss-rust/crates/fluss/src/client/write/batch.rs @@ -20,11 +20,12 @@ use crate::client::broadcast::{BatchWriteResult, BroadcastOnce}; use crate::client::{Record, ResultHandle, WriteRecord}; use crate::compression::ArrowCompressionInfo; use crate::error::{Error, Result}; -use crate::metadata::{DataType, KvFormat, TablePath}; +use crate::metadata::{KvFormat, RowType, TablePath}; use crate::record::MemoryLogRecordsArrowBuilder; use crate::record::kv::KvRecordBatchBuilder; use bytes::Bytes; use std::cmp::max; +use std::sync::Arc; use std::sync::atomic::{AtomicBool, AtomicI32, Ordering}; #[allow(dead_code)] @@ -192,7 +193,7 @@ impl ArrowLogWriteBatch { table_path: TablePath, schema_id: i32, arrow_compression_info: ArrowCompressionInfo, - row_type: &DataType, + row_type: &RowType, bucket_id: BucketId, create_ms: i64, to_append_record_batch: bool, @@ -249,11 +250,12 @@ impl ArrowLogWriteBatch { pub struct KvWriteBatch { write_batch: InnerWriteBatch, kv_batch_builder: KvRecordBatchBuilder, - target_columns: Option>, + target_columns: Option>>, schema_id: i32, } impl KvWriteBatch { + pub const DEFAULT_WRITE_LIMIT: usize = 256; #[allow(clippy::too_many_arguments)] pub fn new( batch_id: i64, @@ -262,7 +264,7 @@ impl KvWriteBatch { write_limit: usize, kv_format: KvFormat, bucket_id: BucketId, - target_columns: Option>, + target_columns: Option>>, create_ms: i64, ) -> Self { let base = InnerWriteBatch::new(batch_id, table_path, create_ms, bucket_id); @@ -284,7 +286,7 @@ impl KvWriteBatch { } }; - let key = kv_write_record.key; + let key = kv_write_record.key.as_ref(); if self.schema_id != write_record.schema_id { return Err(Error::UnexpectedError { @@ -296,7 +298,7 @@ impl KvWriteBatch { }); }; - if self.target_columns.as_deref() != kv_write_record.target_columns { + if self.target_columns != kv_write_record.target_columns { return Err(Error::UnexpectedError { message: format!( "target columns {:?} of the write record to append are not the same as the current target columns 
{:?} in the batch.", @@ -307,14 +309,14 @@ impl KvWriteBatch { }); } - let row = kv_write_record.compacted_row.as_ref(); + let row_bytes = kv_write_record.row_bytes(); - if self.is_closed() || !self.kv_batch_builder.has_room_for_row(key, row) { + if self.is_closed() || !self.kv_batch_builder.has_room_for_row(key, row_bytes) { Ok(None) } else { // append successfully self.kv_batch_builder - .append_row(key, row) + .append_row(key, row_bytes) .map_err(|e| Error::UnexpectedError { message: "Failed to append row to KvWriteBatch".to_string(), source: Some(Box::new(e)), diff --git a/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs b/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs index 2370719efe..817101a41a 100644 --- a/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs +++ b/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs @@ -20,6 +20,7 @@ use crate::cluster::Cluster; use crate::error::Error::IllegalArgument; use crate::error::Result; use crate::metadata::TablePath; +use bytes::Bytes; use rand::Rng; use std::sync::atomic::{AtomicI32, Ordering}; @@ -28,7 +29,7 @@ pub trait BucketAssigner: Sync + Send { fn on_new_batch(&self, cluster: &Cluster, prev_bucket_id: i32); - fn assign_bucket(&self, bucket_key: Option<&[u8]>, cluster: &Cluster) -> Result; + fn assign_bucket(&self, bucket_key: Option<&Bytes>, cluster: &Cluster) -> Result; } #[derive(Debug)] @@ -94,7 +95,7 @@ impl BucketAssigner for StickyBucketAssigner { self.next_bucket(cluster, prev_bucket_id); } - fn assign_bucket(&self, _bucket_key: Option<&[u8]>, cluster: &Cluster) -> Result { + fn assign_bucket(&self, _bucket_key: Option<&Bytes>, cluster: &Cluster) -> Result { let bucket_id = self.current_bucket_id.load(Ordering::Relaxed); if bucket_id < 0 { Ok(self.next_bucket(cluster, bucket_id)) @@ -139,7 +140,7 @@ impl BucketAssigner for HashBucketAssigner { // do nothing } - fn assign_bucket(&self, bucket_key: Option<&[u8]>, _: &Cluster) -> Result { + fn assign_bucket(&self, 
bucket_key: Option<&Bytes>, _: &Cluster) -> Result { let key = bucket_key.ok_or_else(|| IllegalArgument { message: "no bucket key provided".to_string(), })?; @@ -181,7 +182,7 @@ mod tests { let assigner = HashBucketAssigner::new(4, ::of(None)); let cluster = Cluster::default(); let bucket = assigner - .assign_bucket(Some(b"key"), &cluster) + .assign_bucket(Some(&Bytes::from_static(b"key")), &cluster) .expect("bucket"); assert!((0..4).contains(&bucket)); } diff --git a/fluss-rust/crates/fluss/src/client/write/mod.rs b/fluss-rust/crates/fluss/src/client/write/mod.rs index 248218e076..dcc6795996 100644 --- a/fluss-rust/crates/fluss/src/client/write/mod.rs +++ b/fluss-rust/crates/fluss/src/client/write/mod.rs @@ -21,9 +21,10 @@ mod batch; use crate::client::broadcast::{self as client_broadcast, BatchWriteResult, BroadcastOnceReceiver}; use crate::error::Error; use crate::metadata::TablePath; -use crate::row::{CompactedRow, GenericRow}; +use crate::row::GenericRow; pub use accumulator::*; use arrow::array::RecordBatch; +use bytes::Bytes; use std::sync::Arc; pub(crate) mod broadcast; @@ -40,7 +41,7 @@ pub use writer_client::WriterClient; pub struct WriteRecord<'a> { record: Record<'a>, table_path: Arc, - bucket_key: Option<&'a [u8]>, + bucket_key: Option, schema_id: i32, write_format: WriteFormat, } @@ -61,25 +62,43 @@ pub enum LogWriteRecord<'a> { RecordBatch(Arc), } +#[derive(Clone)] +pub enum RowBytes<'a> { + Borrowed(&'a [u8]), + Owned(Bytes), +} + +impl<'a> RowBytes<'a> { + pub fn as_slice(&self) -> &[u8] { + match self { + RowBytes::Borrowed(slice) => slice, + RowBytes::Owned(bytes) => bytes.as_ref(), + } + } +} + pub struct KvWriteRecord<'a> { - // only valid for primary key table - key: &'a [u8], - target_columns: Option<&'a [usize]>, - compacted_row: Option>, + key: Bytes, + target_columns: Option>>, + row_bytes: Option>, } impl<'a> KvWriteRecord<'a> { fn new( - key: &'a [u8], - target_columns: Option<&'a [usize]>, - compacted_row: Option>, + key: Bytes, + 
target_columns: Option>>, + row_bytes: Option>, ) -> Self { KvWriteRecord { key, target_columns, - compacted_row, + row_bytes, } } + + pub fn row_bytes(&self) -> Option<&[u8]> { + self.row_bytes.as_ref().map(|rb| rb.as_slice()) + } } impl<'a> WriteRecord<'a> { @@ -110,17 +129,18 @@ impl<'a> WriteRecord<'a> { pub fn for_upsert( table_path: Arc, schema_id: i32, - bucket_key: &'a [u8], - key: &'a [u8], - target_columns: Option<&'a [usize]>, - row: CompactedRow<'a>, + key: Bytes, + bucket_key: Option, + write_format: WriteFormat, + target_columns: Option>>, + row_bytes: Option>, ) -> Self { Self { - record: Record::Kv(KvWriteRecord::new(key, target_columns, Some(row))), + record: Record::Kv(KvWriteRecord::new(key, target_columns, row_bytes)), table_path, - bucket_key: Some(bucket_key), + bucket_key, schema_id, - write_format: WriteFormat::CompactedKv, + write_format, } } } diff --git a/fluss-rust/crates/fluss/src/client/write/write_format.rs b/fluss-rust/crates/fluss/src/client/write/write_format.rs index 4a0c0d8afa..147152cae4 100644 --- a/fluss-rust/crates/fluss/src/client/write/write_format.rs +++ b/fluss-rust/crates/fluss/src/client/write/write_format.rs @@ -20,6 +20,7 @@ use crate::error::Result; use crate::metadata::KvFormat; use std::fmt::Display; +#[derive(Copy, Clone)] pub enum WriteFormat { ArrowLog, CompactedLog, diff --git a/fluss-rust/crates/fluss/src/client/write/writer_client.rs b/fluss-rust/crates/fluss/src/client/write/writer_client.rs index 22e0397212..65b04f5621 100644 --- a/fluss-rust/crates/fluss/src/client/write/writer_client.rs +++ b/fluss-rust/crates/fluss/src/client/write/writer_client.rs @@ -21,6 +21,7 @@ use crate::client::write::sender::Sender; use crate::client::{RecordAccumulator, ResultHandle, WriteRecord}; use crate::config::Config; use crate::metadata::TablePath; +use bytes::Bytes; use dashmap::DashMap; use std::sync::Arc; use tokio::sync::mpsc; @@ -90,8 +91,9 @@ impl WriterClient { pub async fn send(&self, record: &WriteRecord<'_>) -> 
Result { let table_path = &record.table_path; let cluster = self.metadata.get_cluster(); + let bucket_key = record.bucket_key.as_ref(); - let (bucket_assigner, bucket_id) = self.assign_bucket(table_path)?; + let (bucket_assigner, bucket_id) = self.assign_bucket(bucket_key, table_path)?; let mut result = self .accumulate @@ -101,7 +103,7 @@ impl WriterClient { if result.abort_record_for_new_batch { let prev_bucket_id = bucket_id; bucket_assigner.on_new_batch(&cluster, prev_bucket_id); - let bucket_id = bucket_assigner.assign_bucket(None, &cluster)?; + let bucket_id = bucket_assigner.assign_bucket(bucket_key, &cluster)?; result = self .accumulate .append(record, bucket_id, &cluster, false) @@ -116,6 +118,7 @@ impl WriterClient { } fn assign_bucket( &self, + bucket_key: Option<&Bytes>, table_path: &Arc, ) -> Result<(Arc>, i32)> { let cluster = self.metadata.get_cluster(); @@ -129,7 +132,7 @@ impl WriterClient { assigner } }; - let bucket_id = bucket_assigner.assign_bucket(None, &cluster)?; + let bucket_id = bucket_assigner.assign_bucket(bucket_key, &cluster)?; Ok((bucket_assigner, bucket_id)) } diff --git a/fluss-rust/crates/fluss/src/metadata/table.rs b/fluss-rust/crates/fluss/src/metadata/table.rs index da85b0c2da..8204e7c46c 100644 --- a/fluss-rust/crates/fluss/src/metadata/table.rs +++ b/fluss-rust/crates/fluss/src/metadata/table.rs @@ -16,7 +16,7 @@ // under the License. 
use crate::compression::ArrowCompressionInfo; -use crate::error::Error::InvalidTableError; +use crate::error::Error::{IllegalArgument, InvalidTableError}; use crate::error::{Error, Result}; use crate::metadata::DataLakeFormat; use crate::metadata::datatype::{DataField, DataType, RowType}; @@ -97,8 +97,8 @@ impl PrimaryKey { pub struct Schema { columns: Vec, primary_key: Option, - // must be Row data type kind - row_type: DataType, + row_type: RowType, + auto_increment_col_names: Vec, } impl Schema { @@ -118,7 +118,7 @@ impl Schema { self.primary_key.as_ref() } - pub fn row_type(&self) -> &DataType { + pub fn row_type(&self) -> &RowType { &self.row_type } @@ -144,12 +144,17 @@ impl Schema { pub fn column_names(&self) -> Vec<&str> { self.columns.iter().map(|c| c.name.as_str()).collect() } + + pub fn auto_increment_col_names(&self) -> &Vec { + &self.auto_increment_col_names + } } #[derive(Debug, Default)] pub struct SchemaBuilder { columns: Vec, primary_key: Option, + auto_increment_col_names: Vec, } impl SchemaBuilder { @@ -198,9 +203,36 @@ impl SchemaBuilder { self } + /// Declares a column to be auto-incremented. With an auto-increment column in the table, + /// whenever a new row is inserted into the table, the new row will be assigned with the next + /// available value from the auto-increment sequence. A table can have at most one auto + /// increment column. 
+ pub fn enable_auto_increment(mut self, column_name: &str) -> Result { + if !self.auto_increment_col_names.is_empty() { + return Err(IllegalArgument { + message: "Multiple auto increment columns are not supported yet.".to_string(), + }); + } + + self.auto_increment_col_names.push(column_name.to_string()); + Ok(self) + } + pub fn build(&mut self) -> Result { let columns = Self::normalize_columns(&mut self.columns, self.primary_key.as_ref())?; + let column_names: HashSet<_> = columns.iter().map(|c| &c.name).collect(); + for auto_inc_col in &self.auto_increment_col_names { + if !column_names.contains(auto_inc_col) { + return Err(IllegalArgument { + message: format!( + "Auto increment column '{}' is not found in the schema columns.", + auto_inc_col + ), + }); + } + } + let data_fields = columns .iter() .map(|c| DataField { @@ -213,7 +245,8 @@ impl SchemaBuilder { Ok(Schema { columns, primary_key: self.primary_key.clone(), - row_type: DataType::Row(RowType::new(data_fields)), + row_type: RowType::new(data_fields), + auto_increment_col_names: self.auto_increment_col_names.clone(), }) } @@ -500,7 +533,7 @@ impl TableDescriptor { bucket_keys.retain(|k| !partition_keys.contains(k)); if bucket_keys.is_empty() { - return Err(Error::InvalidTableError { + return Err(InvalidTableError { message: format!( "Primary Key constraint {:?} should not be same with partition fields {:?}.", schema.primary_key().unwrap().column_names(), @@ -580,7 +613,7 @@ pub enum LogFormat { } impl Display for LogFormat { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { LogFormat::ARROW => { write!(f, "ARROW")?; @@ -612,7 +645,7 @@ pub enum KvFormat { } impl Display for KvFormat { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { KvFormat::COMPACTED => write!(f, "COMPACTED")?, KvFormat::INDEXED => write!(f, "INDEXED")?, @@ 
-626,7 +659,7 @@ impl KvFormat { match s.to_uppercase().as_str() { "INDEXED" => Ok(KvFormat::INDEXED), "COMPACTED" => Ok(KvFormat::COMPACTED), - _ => Err(Error::InvalidTableError { + _ => Err(InvalidTableError { message: format!("Unknown kv format: {s}"), }), } @@ -692,7 +725,7 @@ pub struct TableInfo { pub table_id: i64, pub schema_id: i32, pub schema: Schema, - pub row_type: DataType, + pub row_type: RowType, pub primary_keys: Vec, pub physical_primary_keys: Vec, pub bucket_keys: Vec, @@ -708,10 +741,7 @@ pub struct TableInfo { impl TableInfo { pub fn row_type(&self) -> &RowType { - match &self.row_type { - DataType::Row(row_type) => row_type, - _ => panic!("should be a row type"), - } + &self.row_type } } @@ -847,7 +877,7 @@ impl TableInfo { &self.schema } - pub fn get_row_type(&self) -> &DataType { + pub fn get_row_type(&self) -> &RowType { &self.row_type } @@ -946,8 +976,8 @@ impl TableInfo { } } -impl fmt::Display for TableInfo { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl Display for TableInfo { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( f, "TableInfo{{ table_path={:?}, table_id={}, schema_id={}, schema={:?}, physical_primary_keys={:?}, bucket_keys={:?}, partition_keys={:?}, num_buckets={}, properties={:?}, custom_properties={:?}, comment={:?}, created_time={}, modified_time={} }}", @@ -998,7 +1028,7 @@ impl TableBucket { } impl Display for TableBucket { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { if let Some(partition_id) = self.partition_id { write!( f, diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index 3c46f9b5cd..3c94b7208f 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -18,7 +18,7 @@ use crate::client::{LogWriteRecord, Record, WriteRecord}; use crate::compression::ArrowCompressionInfo; use crate::error::{Error, Result}; 
-use crate::metadata::DataType; +use crate::metadata::{DataType, RowType}; use crate::record::{ChangeType, ScanRecord}; use crate::row::{ColumnarRow, GenericRow}; use arrow::array::{ @@ -48,6 +48,7 @@ use std::{ sync::Arc, }; +use crate::error::Error::IllegalArgument; use arrow::ipc::writer::IpcWriteOptions; /// const for record batch pub const BASE_OFFSET_LENGTH: usize = 8; @@ -171,7 +172,7 @@ pub struct RowAppendRecordBatchBuilder { } impl RowAppendRecordBatchBuilder { - pub fn new(row_type: &DataType) -> Self { + pub fn new(row_type: &RowType) -> Self { let schema_ref = to_arrow_schema(row_type); let builders = Mutex::new( schema_ref @@ -251,7 +252,7 @@ impl ArrowRecordBatchInnerBuilder for RowAppendRecordBatchBuilder { impl MemoryLogRecordsArrowBuilder { pub fn new( schema_id: i32, - row_type: &DataType, + row_type: &RowType, to_append_record_batch: bool, arrow_compression_info: ArrowCompressionInfo, ) -> Self { @@ -329,7 +330,7 @@ impl MemoryLogRecordsArrowBuilder { // write arrow batch bytes let mut cursor = Cursor::new(&mut batch_bytes[..]); cursor.set_position(RECORD_BATCH_HEADER_SIZE as u64); - cursor.write_all(real_arrow_batch_bytes).unwrap(); + cursor.write_all(real_arrow_batch_bytes)?; let calcute_crc_bytes = &cursor.get_ref()[SCHEMA_ID_OFFSET..]; // then update crc @@ -562,16 +563,17 @@ impl LogRecordBatch { return Ok(RecordBatch::new_empty(read_context.target_schema.clone())); } - let data = self.data.get(RECORDS_OFFSET..).ok_or_else(|| { - crate::error::Error::UnexpectedError { + let data = self + .data + .get(RECORDS_OFFSET..) 
+ .ok_or_else(|| Error::UnexpectedError { message: format!( "Corrupt log record batch: data length {} is less than RECORDS_OFFSET {}", self.data.len(), RECORDS_OFFSET ), source: None, - } - })?; + })?; read_context.record_batch(data) } } @@ -639,27 +641,20 @@ fn parse_ipc_message( Ok((batch_metadata, body_buffer, message.version())) } -pub fn to_arrow_schema(fluss_schema: &DataType) -> SchemaRef { - match &fluss_schema { - DataType::Row(row_type) => { - let fields: Vec = row_type - .fields() - .iter() - .map(|f| { - Field::new( - f.name(), - to_arrow_type(f.data_type()), - f.data_type().is_nullable(), - ) - }) - .collect(); +pub fn to_arrow_schema(fluss_schema: &RowType) -> SchemaRef { + let fields: Vec = fluss_schema + .fields() + .iter() + .map(|f| { + Field::new( + f.name(), + to_arrow_type(f.data_type()), + f.data_type().is_nullable(), + ) + }) + .collect(); - SchemaRef::new(arrow_schema::Schema::new(fields)) - } - _ => { - panic!("must be row data type.") - } - } + SchemaRef::new(arrow_schema::Schema::new(fields)) } pub fn to_arrow_type(fluss_type: &DataType) -> ArrowDataType { @@ -813,7 +808,7 @@ impl ReadContext { let mut reordering_indexes = Vec::with_capacity(projected_fields.len()); for &original_idx in &projected_fields { let pos = sorted_fields.binary_search(&original_idx).map_err(|_| { - Error::IllegalArgument { + IllegalArgument { message: format!( "Projection index {original_idx} is invalid for the current schema." ), @@ -857,7 +852,7 @@ impl ReadContext { let field_count = schema.fields().len(); for &index in projected_fields { if index >= field_count { - return Err(Error::IllegalArgument { + return Err(IllegalArgument { message: format!( "Projection index {index} is out of bounds for schema with {field_count} fields." 
), @@ -869,7 +864,7 @@ impl ReadContext { pub fn project_schema(schema: SchemaRef, projected_fields: &[usize]) -> Result { Ok(SchemaRef::new(schema.project(projected_fields).map_err( - |e| Error::IllegalArgument { + |e| IllegalArgument { message: format!("Invalid projection: {e}"), }, )?)) @@ -1060,7 +1055,6 @@ pub struct MyVec(pub StreamReader); #[cfg(test)] mod tests { use super::*; - use crate::error::Error; use crate::metadata::{DataField, DataTypes}; #[test] @@ -1217,14 +1211,14 @@ mod tests { #[test] fn projection_rejects_out_of_bounds_index() { - let row_type = DataTypes::row(vec![ + let row_type = RowType::new(vec![ DataField::new("id".to_string(), DataTypes::int(), None), DataField::new("name".to_string(), DataTypes::string(), None), ]); let schema = to_arrow_schema(&row_type); let result = ReadContext::with_projection_pushdown(schema, vec![0, 2], false); - assert!(matches!(result, Err(Error::IllegalArgument { .. }))); + assert!(matches!(result, Err(IllegalArgument { .. }))); } #[test] diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs index 32f712f82e..eb89d69cda 100644 --- a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs @@ -370,12 +370,12 @@ impl Iterator for KvRecordIterator { #[cfg(test)] mod tests { use super::*; - use crate::metadata::{DataTypes, KvFormat, RowType}; + use crate::metadata::{DataTypes, KvFormat}; use crate::record::kv::test_util::TestReadContext; use crate::record::kv::{CURRENT_KV_MAGIC_VALUE, KvRecordBatchBuilder}; use crate::row::InternalRow; use crate::row::binary::BinaryWriter; - use crate::row::compacted::CompactedRow; + use bytes::{BufMut, BytesMut}; #[test] @@ -417,12 +417,11 @@ mod tests { let mut value1_writer = CompactedRowWriter::new(1); value1_writer.write_bytes(&[1, 2, 3, 4, 5]); - let row_type = RowType::with_data_types([DataTypes::bytes()].to_vec()); - let row = 
&CompactedRow::from_bytes(&row_type, value1_writer.buffer()); - builder.append_row(key1, Some(row)).unwrap(); + let row_bytes = value1_writer.buffer(); + builder.append_row(key1, Some(row_bytes)).unwrap(); let key2 = b"key2"; - builder.append_row::(key2, None).unwrap(); + builder.append_row(key2, None).unwrap(); let bytes = builder.build().unwrap(); diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs index e3da8640f7..0b65500fd4 100644 --- a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs @@ -28,7 +28,6 @@ use crate::record::kv::kv_record_batch::{ WRITE_CLIENT_ID_OFFSET, }; use crate::record::kv::{CURRENT_KV_MAGIC_VALUE, NO_BATCH_SEQUENCE, NO_WRITER_ID}; -use crate::row::BinaryRow; use bytes::{Bytes, BytesMut}; use std::io; @@ -88,14 +87,13 @@ impl KvRecordBatchBuilder { } } - /// Check if there is room for a new record containing the given key and row. + /// Check if there is room for a new record containing the given key and row bytes. /// If no records have been appended, this always returns true. - pub fn has_room_for_row(&self, key: &[u8], row: Option<&R>) -> bool { - let value = row.map(|r| r.as_bytes()); - self.size_in_bytes + KvRecord::size_of(key, value) <= self.write_limit + pub fn has_room_for_row(&self, key: &[u8], row_bytes: Option<&[u8]>) -> bool { + self.size_in_bytes + KvRecord::size_of(key, row_bytes) <= self.write_limit } - /// Append a KV record with a row value to the batch. + /// Append a KV record with row bytes to the batch. 
/// /// Returns an error if: /// - The builder has been aborted @@ -103,7 +101,7 @@ impl KvRecordBatchBuilder { /// - Adding this record would exceed the write limit /// - The maximum number of records is exceeded /// - The KV format is not COMPACTED - pub fn append_row(&mut self, key: &[u8], row: Option<&R>) -> io::Result<()> { + pub fn append_row(&mut self, key: &[u8], row_bytes: Option<&[u8]>) -> io::Result<()> { if self.kv_format != KvFormat::COMPACTED { return Err(io::Error::new( io::ErrorKind::InvalidInput, @@ -134,8 +132,7 @@ impl KvRecordBatchBuilder { )); } - let value = row.map(|r| r.as_bytes()); - let record_size = KvRecord::size_of(key, value); + let record_size = KvRecord::size_of(key, row_bytes); if self.size_in_bytes + record_size > self.write_limit { return Err(io::Error::new( io::ErrorKind::WriteZero, @@ -146,7 +143,7 @@ impl KvRecordBatchBuilder { )); } - let record_byte_size = KvRecord::write_to_buf(&mut self.buffer, key, value)?; + let record_byte_size = KvRecord::write_to_buf(&mut self.buffer, key, row_bytes)?; debug_assert_eq!(record_byte_size, record_size, "Record size mismatch"); self.current_record_number += 1; @@ -349,12 +346,12 @@ mod tests { let key1 = b"key1"; let value1 = create_test_row(b"value1"); - assert!(builder.has_room_for_row(key1, Some(&value1))); - builder.append_row(key1, Some(&value1)).unwrap(); + assert!(builder.has_room_for_row(key1, Some(value1.as_bytes()))); + builder.append_row(key1, Some(value1.as_bytes())).unwrap(); let key2 = b"key2"; - assert!(builder.has_room_for_row::(key2, None)); - builder.append_row::(key2, None).unwrap(); + assert!(builder.has_room_for_row(key2, None)); + builder.append_row(key2, None).unwrap(); builder.close().unwrap(); assert!(builder.is_closed()); @@ -369,35 +366,34 @@ mod tests { // Test lifecycle: abort behavior let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); let value = create_test_row(b"value"); - builder.append_row(b"key", Some(&value)).unwrap(); + 
builder.append_row(b"key", Some(value.as_bytes())).unwrap(); builder.abort(); - assert!(builder.append_row::(b"key2", None).is_err()); + assert!(builder.append_row(b"key2", None).is_err()); assert!(builder.build().is_err()); assert!(builder.close().is_err()); // Test lifecycle: close behavior let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); let value = create_test_row(b"value"); - builder.append_row(b"key", Some(&value)).unwrap(); + builder.append_row(b"key", Some(value.as_bytes())).unwrap(); builder.close().unwrap(); - assert!(builder.append_row::(b"key2", None).is_err()); + assert!(builder.append_row(b"key2", None).is_err()); assert!(builder.build().is_ok()); // Test KvFormat validation let mut row_writer = CompactedRowWriter::new(1); row_writer.write_int(42); - let row_type = RowType::with_data_types(vec![DataTypes::int()]); - let row = &CompactedRow::from_bytes(&row_type, row_writer.buffer()); + let row_bytes = row_writer.buffer(); // INDEXED format should reject append_row let mut indexed_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::INDEXED); - let result = indexed_builder.append_row(b"key", Some(row)); + let result = indexed_builder.append_row(b"key", Some(row_bytes)); assert!(result.is_err()); assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidInput); // COMPACTED format should accept append_row let mut compacted_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); - let result = compacted_builder.append_row(b"key", Some(row)); + let result = compacted_builder.append_row(b"key", Some(row_bytes)); assert!(result.is_ok()); } @@ -410,15 +406,17 @@ mod tests { let large_key = vec![0u8; 1000]; let large_value = vec![1u8; 1000]; let large_row = create_test_row(&large_value); - assert!(!builder.has_room_for_row(&large_key, Some(&large_row))); + assert!(!builder.has_room_for_row(&large_key, Some(large_row.as_bytes()))); let small_value = create_test_row(b"value"); - assert!(builder.has_room_for_row(b"key", 
Some(&small_value))); + assert!(builder.has_room_for_row(b"key", Some(small_value.as_bytes()))); // Test append enforcement - add small record first - builder.append_row(b"key", Some(&small_value)).unwrap(); + builder + .append_row(b"key", Some(small_value.as_bytes())) + .unwrap(); // Try to add large record that exceeds limit (reuse large_row from above) - let result = builder.append_row(b"key2", Some(&large_row)); + let result = builder.append_row(b"key2", Some(large_row.as_bytes())); assert!(result.is_err()); assert_eq!(result.unwrap_err().kind(), io::ErrorKind::WriteZero); } @@ -429,10 +427,12 @@ mod tests { builder.current_record_number = i32::MAX - 1; let value1 = create_test_row(b"value1"); - builder.append_row(b"key1", Some(&value1)).unwrap(); + builder + .append_row(b"key1", Some(value1.as_bytes())) + .unwrap(); let value2 = create_test_row(b"value2"); - let result = builder.append_row(b"key2", Some(&value2)); + let result = builder.append_row(b"key2", Some(value2.as_bytes())); assert!(result.is_err()); assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidInput); } @@ -452,13 +452,17 @@ mod tests { builder.set_writer_state(100, 5); let value1 = create_test_row(b"value1"); - builder.append_row(b"key1", Some(&value1)).unwrap(); + builder + .append_row(b"key1", Some(value1.as_bytes())) + .unwrap(); let bytes1 = builder.build().unwrap(); let len1 = bytes1.len(); // Append another record - this should invalidate the cache let value2 = create_test_row(b"value2"); - builder.append_row(b"key2", Some(&value2)).unwrap(); + builder + .append_row(b"key2", Some(value2.as_bytes())) + .unwrap(); let bytes2 = builder.build().unwrap(); let len2 = bytes2.len(); @@ -472,7 +476,7 @@ mod tests { let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); builder.set_writer_state(100, 5); let value = create_test_row(b"value"); - builder.append_row(b"key", Some(&value)).unwrap(); + builder.append_row(b"key", Some(value.as_bytes())).unwrap(); let bytes1 = 
builder.build().unwrap(); // Change writer state - this should invalidate the cache @@ -494,7 +498,6 @@ mod tests { fn test_builder_with_compacted_row_writer() -> crate::error::Result<()> { use crate::record::kv::KvRecordBatch; use crate::row::InternalRow; - use crate::row::compacted::CompactedRow; let mut builder = KvRecordBatchBuilder::new(1, 100000, KvFormat::COMPACTED); builder.set_writer_state(100, 5); @@ -504,26 +507,25 @@ mod tests { row_writer1.write_int(42); row_writer1.write_string("hello"); - let row_type = RowType::with_data_types([DataTypes::int(), DataTypes::string()].to_vec()); - let row1 = &CompactedRow::from_bytes(&row_type, row_writer1.buffer()); + let row_bytes1 = row_writer1.buffer(); let key1 = b"key1"; - assert!(builder.has_room_for_row(key1, Some(row1))); - builder.append_row(key1, Some(row1))?; + assert!(builder.has_room_for_row(key1, Some(row_bytes1))); + builder.append_row(key1, Some(row_bytes1))?; // Create and append second record let mut row_writer2 = CompactedRowWriter::new(2); row_writer2.write_int(100); row_writer2.write_string("world"); - let row2 = &CompactedRow::from_bytes(&row_type, row_writer2.buffer()); + let row_bytes2 = row_writer2.buffer(); let key2 = b"key2"; - builder.append_row(key2, Some(row2))?; + builder.append_row(key2, Some(row_bytes2))?; // Append a deletion record let key3 = b"key3"; - builder.append_row::(key3, None)?; + builder.append_row(key3, None)?; // Build and verify builder.close()?; diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs index fe6c6f0598..9236321668 100644 --- a/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs @@ -18,7 +18,7 @@ //! Default implementation of ReadContext with decoder caching. 
use super::ReadContext; -use crate::error::{Error, Result}; +use crate::error::Result; use crate::metadata::{KvFormat, Schema}; use crate::row::{RowDecoder, RowDecoderFactory}; use std::collections::HashMap; @@ -85,20 +85,7 @@ impl ReadContext for KvRecordReadContext { // Build decoder outside the lock to avoid blocking other threads let schema = self.schema_getter.get_schema(schema_id)?; - let row_type = match schema.row_type() { - crate::metadata::DataType::Row(row_type) => row_type.clone(), - other => { - return Err(Error::IoUnexpectedError { - message: format!( - "Schema {schema_id} has invalid row type: expected Row, got {other:?}" - ), - source: std::io::Error::new( - std::io::ErrorKind::InvalidData, - "Invalid row type", - ), - }); - } - }; + let row_type = schema.row_type().clone(); // Create decoder outside lock let decoder = RowDecoderFactory::create(self.kv_format.clone(), row_type)?; diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs index bc68ea10e5..35d684db6f 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs @@ -15,9 +15,10 @@ // specific language governing permissions and limitations // under the License. 
+use crate::client::WriteFormat; use crate::metadata::RowType; use crate::row::compacted::compacted_row_reader::{CompactedRowDeserializer, CompactedRowReader}; -use crate::row::{BinaryRow, GenericRow, InternalRow}; +use crate::row::{GenericRow, InternalRow}; use std::sync::{Arc, OnceLock}; // Reference implementation: @@ -69,10 +70,8 @@ impl<'a> CompactedRow<'a> { self.decoded_row .get_or_init(|| self.deserializer.deserialize(&self.reader)) } -} -impl BinaryRow for CompactedRow<'_> { - fn as_bytes(&self) -> &[u8] { + pub fn as_bytes(&self) -> &[u8] { self.data } } @@ -153,6 +152,14 @@ impl<'a> InternalRow for CompactedRow<'a> { fn get_timestamp_ltz(&self, pos: usize, precision: u32) -> crate::row::datum::TimestampLtz { self.decoded_row().get_timestamp_ltz(pos, precision) } + + fn as_encoded_bytes(&self, write_format: WriteFormat) -> Option<&[u8]> { + match write_format { + WriteFormat::CompactedKv => Some(self.as_bytes()), + WriteFormat::ArrowLog => None, + WriteFormat::CompactedLog => None, + } + } } #[cfg(test)] diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs index d1ad047a72..ac0100eefb 100644 --- a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs +++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs @@ -63,6 +63,18 @@ impl CompactedRowWriter { Bytes::copy_from_slice(&self.buffer[..self.position]) } + /// Flushes writer's ByteMut, resetting writer's inner state and returns Byte of flushed state + pub fn flush_bytes(&mut self) -> Bytes { + let used = self.buffer.split_to(self.position); + self.position = self.header_size_in_bytes; + if self.buffer.len() < self.header_size_in_bytes { + self.buffer.resize(self.header_size_in_bytes.max(64), 0); + } else { + self.buffer[..self.header_size_in_bytes].fill(0); + } + used.freeze() + } + fn ensure_capacity(&mut self, need_len: usize) { if (self.buffer.len() - self.position) < need_len { let 
new_len = cmp::max(self.buffer.len() * 2, self.buffer.len() + need_len); diff --git a/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs b/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs index 48b9f3ff58..20f28820cf 100644 --- a/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs +++ b/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs @@ -20,8 +20,9 @@ use crate::error::Result; use crate::metadata::RowType; use crate::row::Datum; use crate::row::binary::{BinaryRowFormat, BinaryWriter, ValueWriter}; -use crate::row::compacted::{CompactedRow, CompactedRowDeserializer, CompactedRowWriter}; -use crate::row::encode::{BinaryRow, RowEncoder}; +use crate::row::compacted::{CompactedRowDeserializer, CompactedRowWriter}; +use crate::row::encode::RowEncoder; +use bytes::Bytes; use std::sync::Arc; #[allow(dead_code)] @@ -65,12 +66,8 @@ impl RowEncoder for CompactedRowEncoder<'_> { .write_value(&mut self.writer, pos, &value) } - fn finish_row(&mut self) -> Result { - Ok(CompactedRow::deserialize( - Arc::clone(&self.compacted_row_deserializer), - self.arity, - self.writer.buffer(), - )) + fn finish_row(&mut self) -> Result { + Ok(self.writer.flush_bytes()) } fn close(&mut self) -> Result<()> { diff --git a/fluss-rust/crates/fluss/src/row/encode/mod.rs b/fluss-rust/crates/fluss/src/row/encode/mod.rs index c294ecf1d1..468d4d182b 100644 --- a/fluss-rust/crates/fluss/src/row/encode/mod.rs +++ b/fluss-rust/crates/fluss/src/row/encode/mod.rs @@ -22,7 +22,7 @@ use crate::error::Result; use crate::metadata::{DataLakeFormat, KvFormat, RowType}; use crate::row::encode::compacted_key_encoder::CompactedKeyEncoder; use crate::row::encode::compacted_row_encoder::CompactedRowEncoder; -use crate::row::{BinaryRow, Datum, InternalRow}; +use crate::row::{Datum, InternalRow}; use bytes::Bytes; /// An interface for encoding key of row into bytes. 
@@ -31,8 +31,9 @@ pub trait KeyEncoder { fn encode_key(&mut self, row: &dyn InternalRow) -> Result; } -#[allow(dead_code)] -impl dyn KeyEncoder { +pub struct KeyEncoderFactory; + +impl KeyEncoderFactory { /// Create a key encoder to encode the key bytes of the input row. /// # Arguments /// * `row_type` - the row type of the input row @@ -43,23 +44,21 @@ impl dyn KeyEncoder { /// key encoder pub fn of( row_type: &RowType, - key_fields: Vec, - data_lake_format: Option, + key_fields: &[String], + data_lake_format: &Option, ) -> Result> { match data_lake_format { Some(DataLakeFormat::Paimon) => { unimplemented!("KeyEncoder for Paimon format is currently unimplemented") } Some(DataLakeFormat::Lance) => Ok(Box::new(CompactedKeyEncoder::create_key_encoder( - row_type, - key_fields.as_slice(), + row_type, key_fields, )?)), Some(DataLakeFormat::Iceberg) => { unimplemented!("KeyEncoder for Iceberg format is currently unimplemented") } None => Ok(Box::new(CompactedKeyEncoder::create_key_encoder( - row_type, - key_fields.as_slice(), + row_type, key_fields, )?)), } } @@ -96,7 +95,7 @@ pub trait RowEncoder { /// /// # Returns /// * the written row - fn finish_row(&mut self) -> Result; + fn finish_row(&mut self) -> Result; /// Closes the row encoder /// @@ -110,8 +109,8 @@ pub struct RowEncoderFactory {} #[allow(dead_code)] impl RowEncoderFactory { - pub fn create(kv_format: KvFormat, row_type: &RowType) -> Result { - Self::create_for_field_types(kv_format, row_type.clone()) + pub fn create(kv_format: KvFormat, row_type: RowType) -> Result { + Self::create_for_field_types(kv_format, row_type) } pub fn create_for_field_types( diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index d2f640e4dc..bc8134dde6 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -23,9 +23,11 @@ mod decimal; pub mod binary; pub mod compacted; pub mod encode; -mod field_getter; +pub mod field_getter; mod row_decoder; 
+use crate::client::WriteFormat; +use bytes::Bytes; pub use column::*; pub use compacted::CompactedRow; pub use datum::*; @@ -33,9 +35,23 @@ pub use decimal::{Decimal, MAX_COMPACT_PRECISION}; pub use encode::KeyEncoder; pub use row_decoder::{CompactedRowDecoder, RowDecoder, RowDecoderFactory}; -pub trait BinaryRow: InternalRow { +pub struct BinaryRow<'a> { + data: BinaryDataWrapper<'a>, +} + +pub enum BinaryDataWrapper<'a> { + Bytes(Bytes), + Ref(&'a [u8]), +} + +impl<'a> BinaryRow<'a> { /// Returns the binary representation of this row as a byte slice. - fn as_bytes(&self) -> &[u8]; + pub fn as_bytes(&'a self) -> &'a [u8] { + match &self.data { + BinaryDataWrapper::Bytes(bytes) => bytes.as_ref(), + BinaryDataWrapper::Ref(r) => r, + } + } } // TODO make functions return Result for better error handling @@ -99,6 +115,11 @@ pub trait InternalRow { /// Returns the binary value at the given position fn get_bytes(&self, pos: usize) -> &[u8]; + + /// Returns encoded bytes if already encoded + fn as_encoded_bytes(&self, _write_format: WriteFormat) -> Option<&[u8]> { + None + } } pub struct GenericRow<'a> { From d263743b060b5771b78b6a58a199396e82ab08c2 Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Tue, 20 Jan 2026 15:56:22 +0000 Subject: [PATCH 077/287] feat: Introduce Kv table example (#181) --- fluss-rust/crates/examples/Cargo.toml | 6 +- .../crates/examples/src/example_kv_table.rs | 116 ++++++++++++++++++ .../crates/examples/src/example_table.rs | 2 + fluss-rust/crates/fluss/src/row/mod.rs | 1 + 4 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 fluss-rust/crates/examples/src/example_kv_table.rs diff --git a/fluss-rust/crates/examples/Cargo.toml b/fluss-rust/crates/examples/Cargo.toml index e1fa531764..117ceb2708 100644 --- a/fluss-rust/crates/examples/Cargo.toml +++ b/fluss-rust/crates/examples/Cargo.toml @@ -29,4 +29,8 @@ tokio = { workspace = true } clap = { workspace = true } [[example]] name = "example-table" -path = "src/example_table.rs" \ No 
newline at end of file +path = "src/example_table.rs" + +[[example]] +name = "example-upsert-lookup" +path = "src/example_kv_table.rs" \ No newline at end of file diff --git a/fluss-rust/crates/examples/src/example_kv_table.rs b/fluss-rust/crates/examples/src/example_kv_table.rs new file mode 100644 index 0000000000..75821a37e6 --- /dev/null +++ b/fluss-rust/crates/examples/src/example_kv_table.rs @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use clap::Parser; +use fluss::client::{FlussConnection, UpsertWriter}; +use fluss::config::Config; +use fluss::error::Result; +use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; +use fluss::row::{GenericRow, InternalRow}; + +#[tokio::main] +#[allow(dead_code)] +pub async fn main() -> Result<()> { + let mut config = Config::parse(); + config.bootstrap_server = Some("127.0.0.1:9123".to_string()); + + let conn = FlussConnection::new(config).await?; + + let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("id", DataTypes::int()) + .column("name", DataTypes::string()) + .column("age", DataTypes::bigint()) + .primary_key(vec!["id".to_string()]) + .build()?, + ) + .build()?; + + let table_path = TablePath::new("fluss".to_owned(), "rust_upsert_lookup_example".to_owned()); + + let admin = conn.get_admin().await?; + admin + .create_table(&table_path, &table_descriptor, true) + .await?; + println!( + "Created KV Table:\n {}\n", + admin.get_table(&table_path).await? 
+ ); + + let table = conn.get_table(&table_path).await?; + let table_upsert = table.new_upsert()?; + let mut upsert_writer = table_upsert.create_writer()?; + + println!("\n=== Upserting ==="); + for (id, name, age) in [(1, "Verso", 32i64), (2, "Noco", 25), (3, "Esquie", 35)] { + let mut row = GenericRow::new(); + row.set_field(0, id); + row.set_field(1, name); + row.set_field(2, age); + upsert_writer.upsert(&row).await?; + println!("Upserted: {row:?}"); + } + + println!("\n=== Looking up ==="); + let mut lookuper = table.new_lookup()?.create_lookuper()?; + + for id in 1..=2 { + let result = lookuper.lookup(&make_key(id)).await?; + let row = result.get_single_row()?.unwrap(); + println!( + "Found id={id}: name={}, age={}", + row.get_string(1), + row.get_long(2) + ); + } + + println!("\n=== Updating ==="); + let mut row = GenericRow::new(); + row.set_field(0, 1); + row.set_field(1, "Verso"); + row.set_field(2, 33i64); + upsert_writer.upsert(&row).await?; + println!("Updated: {row:?}"); + + let result = lookuper.lookup(&make_key(1)).await?; + let row = result.get_single_row()?.unwrap(); + println!( + "Verified update: name={}, age={}", + row.get_string(1), + row.get_long(2) + ); + + println!("\n=== Deleting ==="); + let mut row = GenericRow::new(); + row.set_field(0, 2); + upsert_writer.delete(&row).await?; + println!("Deleted: {row:?}"); + + let result = lookuper.lookup(&make_key(2)).await?; + if result.get_single_row()?.is_none() { + println!("Verified deletion"); + } + + Ok(()) +} + +fn make_key(id: i32) -> GenericRow<'static> { + let mut row = GenericRow::new(); + row.set_field(0, id); + row +} diff --git a/fluss-rust/crates/examples/src/example_table.rs b/fluss-rust/crates/examples/src/example_table.rs index 2d6ac53d8f..7333056feb 100644 --- a/fluss-rust/crates/examples/src/example_table.rs +++ b/fluss-rust/crates/examples/src/example_table.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. 
+mod example_kv_table; + use clap::Parser; use fluss::client::FlussConnection; use fluss::config::Config; diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs index bc8134dde6..81a425408f 100644 --- a/fluss-rust/crates/fluss/src/row/mod.rs +++ b/fluss-rust/crates/fluss/src/row/mod.rs @@ -122,6 +122,7 @@ pub trait InternalRow { } } +#[derive(Debug)] pub struct GenericRow<'a> { pub values: Vec>, } From 76029f7d61b1d34b8b3dba35f0b5dc4ba4a67070 Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Wed, 21 Jan 2026 01:50:55 +0000 Subject: [PATCH 078/287] feat: KvWriteBatch wiring in Sender (#184) --- .../crates/examples/src/example_kv_table.rs | 4 +- .../crates/fluss/src/client/write/batch.rs | 4 + .../crates/fluss/src/client/write/sender.rs | 248 +++++++++++++----- .../crates/fluss/src/rpc/message/mod.rs | 1 + 4 files changed, 197 insertions(+), 60 deletions(-) diff --git a/fluss-rust/crates/examples/src/example_kv_table.rs b/fluss-rust/crates/examples/src/example_kv_table.rs index 75821a37e6..dcf7db8f05 100644 --- a/fluss-rust/crates/examples/src/example_kv_table.rs +++ b/fluss-rust/crates/examples/src/example_kv_table.rs @@ -69,7 +69,7 @@ pub async fn main() -> Result<()> { println!("\n=== Looking up ==="); let mut lookuper = table.new_lookup()?.create_lookuper()?; - for id in 1..=2 { + for id in 1..=3 { let result = lookuper.lookup(&make_key(id)).await?; let row = result.get_single_row()?.unwrap(); println!( @@ -98,6 +98,8 @@ pub async fn main() -> Result<()> { println!("\n=== Deleting ==="); let mut row = GenericRow::new(); row.set_field(0, 2); + row.set_field(1, ""); + row.set_field(2, 0i64); upsert_writer.delete(&row).await?; println!("Deleted: {row:?}"); diff --git a/fluss-rust/crates/fluss/src/client/write/batch.rs b/fluss-rust/crates/fluss/src/client/write/batch.rs index 2ddf5192ed..159e3136e8 100644 --- a/fluss-rust/crates/fluss/src/client/write/batch.rs +++ b/fluss-rust/crates/fluss/src/client/write/batch.rs @@ -336,6 +336,10 
@@ impl KvWriteBatch { pub fn close(&mut self) -> Result<()> { self.kv_batch_builder.close() } + + pub fn target_columns(&self) -> Option<&Arc>> { + self.target_columns.as_ref() + } } #[cfg(test)] diff --git a/fluss-rust/crates/fluss/src/client/write/sender.rs b/fluss-rust/crates/fluss/src/client/write/sender.rs index 7ea24e30f6..ceed245621 100644 --- a/fluss-rust/crates/fluss/src/client/write/sender.rs +++ b/fluss-rust/crates/fluss/src/client/write/sender.rs @@ -17,11 +17,16 @@ use crate::client::broadcast; use crate::client::metadata::Metadata; +use crate::client::write::batch::WriteBatch; use crate::client::{ReadyWriteBatch, RecordAccumulator}; +use crate::error::Error::UnexpectedError; use crate::error::{FlussError, Result}; use crate::metadata::{TableBucket, TablePath}; -use crate::proto::ProduceLogResponse; -use crate::rpc::message::ProduceLogRequest; +use crate::proto::{ + PbProduceLogRespForBucket, PbPutKvRespForBucket, ProduceLogResponse, PutKvResponse, +}; +use crate::rpc::ServerConnection; +use crate::rpc::message::{ProduceLogRequest, PutKvRequest}; use log::warn; use parking_lot::Mutex; use std::collections::{HashMap, HashSet}; @@ -182,23 +187,22 @@ impl Sender { .iter() .filter_map(|bucket| records_by_bucket.remove(bucket)) .collect(); + if request_batches.is_empty() { continue; } - let request = match ProduceLogRequest::new( + + let write_request = match Self::build_write_request( table_id, acks, self.max_request_timeout_ms, &mut request_batches, ) { - Ok(request) => request, + Ok(req) => req, Err(e) => { self.handle_batches_with_local_error( - table_buckets - .iter() - .filter_map(|bucket| records_by_bucket.remove(bucket)) - .collect(), - format!("Failed to build produce request: {e}"), + request_batches, + format!("Failed to build write request: {e}"), ) .await?; continue; @@ -211,27 +215,12 @@ impl Sender { records_by_bucket.insert(request_batch.table_bucket.clone(), request_batch); } - let response = match connection.request(request).await { - 
Ok(response) => response, - Err(e) => { - self.handle_batches_with_error( - table_buckets - .iter() - .filter_map(|bucket| records_by_bucket.remove(bucket)) - .collect(), - FlussError::NetworkException, - format!("Failed to send produce request: {e}"), - ) - .await?; - continue; - } - }; - - self.handle_produce_response( + self.send_and_handle_response( + &connection, + write_request, table_id, &table_buckets, &mut records_by_bucket, - response, ) .await?; } @@ -239,50 +228,120 @@ impl Sender { Ok(()) } - async fn handle_produce_response( + fn build_write_request( + table_id: i64, + acks: i16, + timeout_ms: i32, + request_batches: &mut [ReadyWriteBatch], + ) -> Result { + let first_batch = &request_batches.first().unwrap().write_batch; + + let request = match first_batch { + WriteBatch::ArrowLog(_) => { + let req = ProduceLogRequest::new(table_id, acks, timeout_ms, request_batches)?; + WriteRequest::ProduceLog(req) + } + WriteBatch::Kv(kv_write_batch) => { + let target_columns = kv_write_batch.target_columns(); + for batch in request_batches.iter().skip(1) { + match &batch.write_batch { + WriteBatch::ArrowLog(_) => { + return Err(UnexpectedError { + message: "Expecting KvWriteBatch but found ArrowLogWriteBatch" + .to_string(), + source: None, + }); + } + WriteBatch::Kv(kvb) => { + if target_columns != kvb.target_columns() { + return Err(UnexpectedError { + message: format!( + "All the write batches to make put kv request should have the same target columns, but got {:?} and {:?}.", + target_columns, + kvb.target_columns() + ), + source: None, + }); + } + } + } + } + let cols = target_columns + .map(|arc| arc.iter().map(|&c| c as i32).collect()) + .unwrap_or_default(); + let req = PutKvRequest::new(table_id, acks, timeout_ms, cols, request_batches)?; + WriteRequest::PutKv(req) + } + }; + + Ok(request) + } + + async fn send_and_handle_response( + &self, + connection: &ServerConnection, + write_request: WriteRequest, + table_id: i64, + table_buckets: &[TableBucket], + 
records_by_bucket: &mut HashMap, + ) -> Result<()> { + macro_rules! send { + ($request:expr) => { + match connection.request($request).await { + Ok(response) => { + self.handle_write_response( + table_id, + table_buckets, + records_by_bucket, + response, + ) + .await + } + Err(e) => { + self.handle_batches_with_error( + table_buckets + .iter() + .filter_map(|b| records_by_bucket.remove(b)) + .collect(), + FlussError::NetworkException, + format!("Failed to send write request: {e}"), + ) + .await + } + } + }; + } + + match write_request { + WriteRequest::ProduceLog(req) => send!(req), + WriteRequest::PutKv(req) => send!(req), + } + } + + async fn handle_write_response( &self, table_id: i64, request_buckets: &[TableBucket], records_by_bucket: &mut HashMap, - response: ProduceLogResponse, + response: R, ) -> Result<()> { let mut invalid_metadata_tables: HashSet = HashSet::new(); let mut pending_buckets: HashSet = request_buckets.iter().cloned().collect(); - for produce_log_response_for_bucket in response.buckets_resp.iter() { - let tb = TableBucket::new(table_id, produce_log_response_for_bucket.bucket_id); + for bucket_resp in response.buckets_resp() { + let tb = TableBucket::new(table_id, bucket_resp.bucket_id()); let Some(ready_batch) = records_by_bucket.remove(&tb) else { panic!("Missing ready batch for table bucket {tb}"); }; pending_buckets.remove(&tb); - if let Some(error_code) = produce_log_response_for_bucket.error_code { - if error_code == FlussError::None.code() { - self.complete_batch(ready_batch); - continue; - } - - let error = FlussError::for_code(error_code); - let message = produce_log_response_for_bucket - .error_message - .clone() - .unwrap_or_else(|| error.message().to_string()); - if let Some(table_path) = self - .handle_write_batch_error(ready_batch, error, message) - .await? 
- { - invalid_metadata_tables.insert(table_path); - } - } else { - self.complete_batch(ready_batch) - } - } - if !pending_buckets.is_empty() { - for bucket in pending_buckets { - if let Some(ready_batch) = records_by_bucket.remove(&bucket) { - let message = - format!("Missing response for table bucket {bucket} in produce response."); - let error = FlussError::UnknownServerError; + match bucket_resp.error_code() { + Some(code) if code != FlussError::None.code() => { + let error = FlussError::for_code(code); + let message = bucket_resp + .error_message() + .cloned() + .unwrap_or_else(|| error.message().to_string()); if let Some(table_path) = self .handle_write_batch_error(ready_batch, error, message) .await? @@ -290,8 +349,25 @@ impl Sender { invalid_metadata_tables.insert(table_path); } } + _ => self.complete_batch(ready_batch), } } + + for bucket in pending_buckets { + if let Some(ready_batch) = records_by_bucket.remove(&bucket) { + if let Some(table_path) = self + .handle_write_batch_error( + ready_batch, + FlussError::UnknownServerError, + format!("Missing response for table bucket {bucket}"), + ) + .await? 
+ { + invalid_metadata_tables.insert(table_path); + } + } + } + self.update_metadata_if_needed(invalid_metadata_tables) .await; Ok(()) @@ -450,6 +526,60 @@ impl Sender { } } +enum WriteRequest { + ProduceLog(ProduceLogRequest), + PutKv(PutKvRequest), +} + +trait BucketResponse { + fn bucket_id(&self) -> i32; + fn error_code(&self) -> Option; + fn error_message(&self) -> Option<&String>; +} + +impl BucketResponse for PbProduceLogRespForBucket { + fn bucket_id(&self) -> i32 { + self.bucket_id + } + fn error_code(&self) -> Option { + self.error_code + } + fn error_message(&self) -> Option<&String> { + self.error_message.as_ref() + } +} + +impl BucketResponse for PbPutKvRespForBucket { + fn bucket_id(&self) -> i32 { + self.bucket_id + } + fn error_code(&self) -> Option { + self.error_code + } + fn error_message(&self) -> Option<&String> { + self.error_message.as_ref() + } +} + +trait WriteResponse { + type BucketResp: BucketResponse; + fn buckets_resp(&self) -> &[Self::BucketResp]; +} + +impl WriteResponse for ProduceLogResponse { + type BucketResp = PbProduceLogRespForBucket; + fn buckets_resp(&self) -> &[Self::BucketResp] { + &self.buckets_resp + } +} + +impl WriteResponse for PutKvResponse { + type BucketResp = PbPutKvRespForBucket; + fn buckets_resp(&self) -> &[Self::BucketResp] { + &self.buckets_resp + } +} + #[cfg(test)] mod tests { use super::*; @@ -563,7 +693,7 @@ mod tests { }; sender - .handle_produce_response(1, &request_buckets, &mut records_by_bucket, response) + .handle_write_response(1, &request_buckets, &mut records_by_bucket, response) .await?; let batch_result = handle.wait().await?; diff --git a/fluss-rust/crates/fluss/src/rpc/message/mod.rs b/fluss-rust/crates/fluss/src/rpc/message/mod.rs index 4e6c8e1eaf..881a64f687 100644 --- a/fluss-rust/crates/fluss/src/rpc/message/mod.rs +++ b/fluss-rust/crates/fluss/src/rpc/message/mod.rs @@ -57,6 +57,7 @@ pub use list_offsets::*; pub use list_tables::*; pub use lookup::*; pub use produce_log::*; +pub use 
put_kv::*; pub use table_exists::*; pub use update_metadata::*; From 203093cdb46a2825fc0e743cb43eb19152f83d0c Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Wed, 21 Jan 2026 12:56:37 +0000 Subject: [PATCH 079/287] feat: Fix TableLookup so that SchemaId field bytes are not passed to CompactedRow::from_bytes(), these fields can be skipped as current rust client implementation does not check schema and already passes row_type (#190) --- fluss-rust/crates/fluss/src/client/table/lookup.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fluss-rust/crates/fluss/src/client/table/lookup.rs b/fluss-rust/crates/fluss/src/client/table/lookup.rs index cd23503a5b..4e89176a61 100644 --- a/fluss-rust/crates/fluss/src/client/table/lookup.rs +++ b/fluss-rust/crates/fluss/src/client/table/lookup.rs @@ -20,6 +20,7 @@ use crate::client::connection::FlussConnection; use crate::client::metadata::Metadata; use crate::error::{Error, Result}; use crate::metadata::{RowType, TableBucket, TableInfo}; +use crate::record::kv::SCHEMA_ID_LENGTH; use crate::row::InternalRow; use crate::row::compacted::CompactedRow; use crate::row::encode::{KeyEncoder, KeyEncoderFactory}; @@ -64,7 +65,10 @@ impl<'a> LookupResult<'a> { pub fn get_single_row(&self) -> Result>> { match self.rows.len() { 0 => Ok(None), - 1 => Ok(Some(CompactedRow::from_bytes(self.row_type, &self.rows[0]))), + 1 => Ok(Some(CompactedRow::from_bytes( + self.row_type, + &self.rows[0][SCHEMA_ID_LENGTH..], + ))), _ => Err(Error::UnexpectedError { message: "LookupResult contains multiple rows, use get_rows() instead".to_string(), source: None, @@ -76,7 +80,8 @@ impl<'a> LookupResult<'a> { pub fn get_rows(&self) -> Vec> { self.rows .iter() - .map(|bytes| CompactedRow::from_bytes(self.row_type, bytes)) + // TODO Add schema id check and fetch when implementing prefix lookup + .map(|bytes| CompactedRow::from_bytes(self.row_type, &bytes[SCHEMA_ID_LENGTH..])) .collect() } } From 2f3269d2c04429ef5671262225dac6b6c26f169d Mon 
Sep 17 00:00:00 2001 From: Anton Borisov <148864665+fresh-borzoni@users.noreply.github.com> Date: Fri, 23 Jan 2026 03:34:58 +0000 Subject: [PATCH 080/287] feat: Arrow serialization for decimal and temporal types (#196) --- .../src/client/table/log_fetch_buffer.rs | 12 +- .../crates/fluss/src/client/table/scanner.rs | 8 +- .../fluss/src/client/write/accumulator.rs | 2 +- .../crates/fluss/src/client/write/batch.rs | 8 +- fluss-rust/crates/fluss/src/record/arrow.rs | 569 ++++++++++++++---- fluss-rust/crates/fluss/src/row/datum.rs | 468 +++++++++++++- 6 files changed, 916 insertions(+), 151 deletions(-) diff --git a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs index ca0a2532c8..214a79cd7d 100644 --- a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs +++ b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs @@ -657,13 +657,13 @@ mod tests { use std::sync::Arc; use std::time::Duration; - fn test_read_context() -> ReadContext { + fn test_read_context() -> Result { let row_type = RowType::new(vec![DataField::new( "id".to_string(), DataTypes::int(), None, )]); - ReadContext::new(to_arrow_schema(&row_type), false) + Ok(ReadContext::new(to_arrow_schema(&row_type)?, false)) } struct ErrorPendingFetch { @@ -689,7 +689,7 @@ mod tests { #[tokio::test] async fn await_not_empty_returns_wakeup_error() { - let buffer = LogFetchBuffer::new(test_read_context()); + let buffer = LogFetchBuffer::new(test_read_context().unwrap()); buffer.wakeup(); let result = buffer.await_not_empty(Duration::from_millis(10)).await; @@ -698,7 +698,7 @@ mod tests { #[tokio::test] async fn await_not_empty_returns_pending_error() { - let buffer = LogFetchBuffer::new(test_read_context()); + let buffer = LogFetchBuffer::new(test_read_context().unwrap()); let table_bucket = TableBucket::new(1, 0); buffer.pend(Box::new(ErrorPendingFetch { table_bucket: table_bucket.clone(), @@ -728,7 +728,7 @@ mod tests { 
compression_type: ArrowCompressionType::None, compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, }, - ); + )?; let mut row = GenericRow::new(); row.set_field(0, 1_i32); @@ -738,7 +738,7 @@ mod tests { let data = builder.build()?; let log_records = LogRecordsBatches::new(data.clone()); - let read_context = ReadContext::new(to_arrow_schema(&row_type), false); + let read_context = ReadContext::new(to_arrow_schema(&row_type)?, false); let mut fetch = DefaultCompletedFetch::new( TableBucket::new(1, 0), log_records, diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs index e9b2ce106d..cf0b257f00 100644 --- a/fluss-rust/crates/fluss/src/client/table/scanner.rs +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -470,7 +470,7 @@ impl LogFetcher { log_scanner_status: Arc, projected_fields: Option>, ) -> Result { - let full_arrow_schema = to_arrow_schema(table_info.get_row_type()); + let full_arrow_schema = to_arrow_schema(table_info.get_row_type())?; let read_context = Self::create_read_context(full_arrow_schema.clone(), projected_fields.clone(), false)?; let remote_read_context = @@ -1445,7 +1445,7 @@ mod tests { compression_type: ArrowCompressionType::None, compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, }, - ); + )?; let record = WriteRecord::for_append( table_path, 1, @@ -1477,7 +1477,7 @@ mod tests { let data = build_records(&table_info, Arc::new(table_path))?; let log_records = LogRecordsBatches::new(data.clone()); - let read_context = ReadContext::new(to_arrow_schema(table_info.get_row_type()), false); + let read_context = ReadContext::new(to_arrow_schema(table_info.get_row_type())?, false); let completed = DefaultCompletedFetch::new(bucket.clone(), log_records, data.len(), read_context, 0, 0); fetcher.log_fetch_buffer.add(Box::new(completed)); @@ -1506,7 +1506,7 @@ mod tests { let bucket = TableBucket::new(1, 0); let data = build_records(&table_info, Arc::new(table_path))?; let 
log_records = LogRecordsBatches::new(data.clone()); - let read_context = ReadContext::new(to_arrow_schema(table_info.get_row_type()), false); + let read_context = ReadContext::new(to_arrow_schema(table_info.get_row_type())?, false); let mut completed: Box = Box::new(DefaultCompletedFetch::new( bucket, log_records, diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs index fb7b54470d..46c822c1a2 100644 --- a/fluss-rust/crates/fluss/src/client/write/accumulator.rs +++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs @@ -112,7 +112,7 @@ impl RecordAccumulator { bucket_id, current_time_ms(), matches!(&record.record, Record::Log(LogWriteRecord::RecordBatch(_))), - )), + )?), Record::Kv(kv_record) => Kv(KvWriteBatch::new( self.batch_id.fetch_add(1, Ordering::Relaxed), table_path.as_ref().clone(), diff --git a/fluss-rust/crates/fluss/src/client/write/batch.rs b/fluss-rust/crates/fluss/src/client/write/batch.rs index 159e3136e8..78381c6e76 100644 --- a/fluss-rust/crates/fluss/src/client/write/batch.rs +++ b/fluss-rust/crates/fluss/src/client/write/batch.rs @@ -197,18 +197,18 @@ impl ArrowLogWriteBatch { bucket_id: BucketId, create_ms: i64, to_append_record_batch: bool, - ) -> Self { + ) -> Result { let base = InnerWriteBatch::new(batch_id, table_path, create_ms, bucket_id); - Self { + Ok(Self { write_batch: base, arrow_builder: MemoryLogRecordsArrowBuilder::new( schema_id, row_type, to_append_record_batch, arrow_compression_info, - ), + )?, built_records: None, - } + }) } pub fn batch_id(&self) -> i64 { diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs index 3c94b7208f..39114d3273 100644 --- a/fluss-rust/crates/fluss/src/record/arrow.rs +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -22,9 +22,12 @@ use crate::metadata::{DataType, RowType}; use crate::record::{ChangeType, ScanRecord}; use crate::row::{ColumnarRow, GenericRow}; use 
arrow::array::{ - ArrayBuilder, ArrayRef, BinaryBuilder, BooleanBuilder, Float32Builder, Float64Builder, - Int8Builder, Int16Builder, Int32Builder, Int64Builder, StringBuilder, UInt8Builder, - UInt16Builder, UInt32Builder, UInt64Builder, + ArrayBuilder, ArrayRef, BinaryBuilder, BooleanBuilder, Date32Builder, Decimal128Builder, + Float32Builder, Float64Builder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, + StringBuilder, Time32MillisecondBuilder, Time32SecondBuilder, Time64MicrosecondBuilder, + Time64NanosecondBuilder, TimestampMicrosecondBuilder, TimestampMillisecondBuilder, + TimestampNanosecondBuilder, TimestampSecondBuilder, UInt8Builder, UInt16Builder, UInt32Builder, + UInt64Builder, }; use arrow::{ array::RecordBatch, @@ -42,7 +45,6 @@ use byteorder::WriteBytesExt; use byteorder::{ByteOrder, LittleEndian}; use bytes::Bytes; use crc32c::crc32c; -use parking_lot::Mutex; use std::{ io::{Cursor, Write}, sync::Arc, @@ -113,7 +115,7 @@ pub struct MemoryLogRecordsArrowBuilder { } pub trait ArrowRecordBatchInnerBuilder: Send + Sync { - fn build_arrow_record_batch(&self) -> Result>; + fn build_arrow_record_batch(&mut self) -> Result>; fn append(&mut self, row: &GenericRow) -> Result; @@ -133,7 +135,7 @@ pub struct PrebuiltRecordBatchBuilder { } impl ArrowRecordBatchInnerBuilder for PrebuiltRecordBatchBuilder { - fn build_arrow_record_batch(&self) -> Result> { + fn build_arrow_record_batch(&mut self) -> Result> { Ok(self.arrow_record_batch.as_ref().unwrap().clone()) } @@ -167,66 +169,132 @@ impl ArrowRecordBatchInnerBuilder for PrebuiltRecordBatchBuilder { pub struct RowAppendRecordBatchBuilder { table_schema: SchemaRef, - arrow_column_builders: Mutex>>, + arrow_column_builders: Vec>, records_count: i32, } impl RowAppendRecordBatchBuilder { - pub fn new(row_type: &RowType) -> Self { - let schema_ref = to_arrow_schema(row_type); - let builders = Mutex::new( - schema_ref - .fields() - .iter() - .map(|field| Self::create_builder(field.data_type())) - .collect(), 
- ); - Self { + pub fn new(row_type: &RowType) -> Result { + let schema_ref = to_arrow_schema(row_type)?; + let builders: Result> = schema_ref + .fields() + .iter() + .map(|field| Self::create_builder(field.data_type())) + .collect(); + Ok(Self { table_schema: schema_ref.clone(), - arrow_column_builders: builders, + arrow_column_builders: builders?, records_count: 0, - } + }) } - fn create_builder(data_type: &arrow_schema::DataType) -> Box { + fn create_builder(data_type: &arrow_schema::DataType) -> Result> { match data_type { - arrow_schema::DataType::Int8 => Box::new(Int8Builder::new()), - arrow_schema::DataType::Int16 => Box::new(Int16Builder::new()), - arrow_schema::DataType::Int32 => Box::new(Int32Builder::new()), - arrow_schema::DataType::Int64 => Box::new(Int64Builder::new()), - arrow_schema::DataType::UInt8 => Box::new(UInt8Builder::new()), - arrow_schema::DataType::UInt16 => Box::new(UInt16Builder::new()), - arrow_schema::DataType::UInt32 => Box::new(UInt32Builder::new()), - arrow_schema::DataType::UInt64 => Box::new(UInt64Builder::new()), - arrow_schema::DataType::Float32 => Box::new(Float32Builder::new()), - arrow_schema::DataType::Float64 => Box::new(Float64Builder::new()), - arrow_schema::DataType::Boolean => Box::new(BooleanBuilder::new()), - arrow_schema::DataType::Utf8 => Box::new(StringBuilder::new()), - arrow_schema::DataType::Binary => Box::new(BinaryBuilder::new()), - dt => panic!("Unsupported data type: {dt:?}"), + arrow_schema::DataType::Int8 => Ok(Box::new(Int8Builder::new())), + arrow_schema::DataType::Int16 => Ok(Box::new(Int16Builder::new())), + arrow_schema::DataType::Int32 => Ok(Box::new(Int32Builder::new())), + arrow_schema::DataType::Int64 => Ok(Box::new(Int64Builder::new())), + arrow_schema::DataType::UInt8 => Ok(Box::new(UInt8Builder::new())), + arrow_schema::DataType::UInt16 => Ok(Box::new(UInt16Builder::new())), + arrow_schema::DataType::UInt32 => Ok(Box::new(UInt32Builder::new())), + arrow_schema::DataType::UInt64 => 
Ok(Box::new(UInt64Builder::new())), + arrow_schema::DataType::Float32 => Ok(Box::new(Float32Builder::new())), + arrow_schema::DataType::Float64 => Ok(Box::new(Float64Builder::new())), + arrow_schema::DataType::Boolean => Ok(Box::new(BooleanBuilder::new())), + arrow_schema::DataType::Utf8 => Ok(Box::new(StringBuilder::new())), + arrow_schema::DataType::Binary => Ok(Box::new(BinaryBuilder::new())), + arrow_schema::DataType::Decimal128(precision, scale) => { + let builder = Decimal128Builder::new() + .with_precision_and_scale(*precision, *scale) + .map_err(|e| Error::IllegalArgument { + message: format!( + "Invalid decimal precision {} or scale {}: {}", + precision, scale, e + ), + })?; + Ok(Box::new(builder)) + } + arrow_schema::DataType::Date32 => Ok(Box::new(Date32Builder::new())), + arrow_schema::DataType::Time32(unit) => match unit { + arrow_schema::TimeUnit::Second => Ok(Box::new(Time32SecondBuilder::new())), + arrow_schema::TimeUnit::Millisecond => { + Ok(Box::new(Time32MillisecondBuilder::new())) + } + _ => Err(Error::IllegalArgument { + message: format!( + "Time32 only supports Second and Millisecond units, got: {:?}", + unit + ), + }), + }, + arrow_schema::DataType::Time64(unit) => match unit { + arrow_schema::TimeUnit::Microsecond => { + Ok(Box::new(Time64MicrosecondBuilder::new())) + } + arrow_schema::TimeUnit::Nanosecond => Ok(Box::new(Time64NanosecondBuilder::new())), + _ => Err(Error::IllegalArgument { + message: format!( + "Time64 only supports Microsecond and Nanosecond units, got: {:?}", + unit + ), + }), + }, + arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _) => { + Ok(Box::new(TimestampSecondBuilder::new())) + } + arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _) => { + Ok(Box::new(TimestampMillisecondBuilder::new())) + } + arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _) => { + Ok(Box::new(TimestampMicrosecondBuilder::new())) + } + 
arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => { + Ok(Box::new(TimestampNanosecondBuilder::new())) + } + dt => Err(Error::IllegalArgument { + message: format!("Unsupported data type: {dt:?}"), + }), } } } impl ArrowRecordBatchInnerBuilder for RowAppendRecordBatchBuilder { - fn build_arrow_record_batch(&self) -> Result> { - let arrays = self + fn build_arrow_record_batch(&mut self) -> Result> { + let arrays: Result> = self .arrow_column_builders - .lock() .iter_mut() - .map(|b| b.finish()) - .collect::>(); + .enumerate() + .map(|(idx, b)| { + let array = b.finish(); + let expected_type = self.table_schema.field(idx).data_type(); + + // Validate array type matches schema + if array.data_type() != expected_type { + return Err(Error::IllegalArgument { + message: format!( + "Builder type mismatch at column {}: expected {:?}, got {:?}", + idx, + expected_type, + array.data_type() + ), + }); + } + + Ok(array) + }) + .collect(); + Ok(Arc::new(RecordBatch::try_new( self.table_schema.clone(), - arrays, + arrays?, )?)) } fn append(&mut self, row: &GenericRow) -> Result { for (idx, value) in row.values.iter().enumerate() { - let mut builder_binding = self.arrow_column_builders.lock(); - let builder = builder_binding.get_mut(idx).unwrap(); - value.append_to(builder.as_mut())?; + let field_type = self.table_schema.field(idx).data_type(); + let builder = self.arrow_column_builders.get_mut(idx).unwrap(); + value.append_to(builder.as_mut(), field_type)?; } self.records_count += 1; Ok(true) @@ -255,15 +323,15 @@ impl MemoryLogRecordsArrowBuilder { row_type: &RowType, to_append_record_batch: bool, arrow_compression_info: ArrowCompressionInfo, - ) -> Self { + ) -> Result { let arrow_batch_builder: Box = { if to_append_record_batch { Box::new(PrebuiltRecordBatchBuilder::default()) } else { - Box::new(RowAppendRecordBatchBuilder::new(row_type)) + Box::new(RowAppendRecordBatchBuilder::new(row_type)?) 
} }; - MemoryLogRecordsArrowBuilder { + Ok(MemoryLogRecordsArrowBuilder { base_log_offset: BUILDER_DEFAULT_OFFSET, schema_id, magic: CURRENT_LOG_MAGIC_VALUE, @@ -272,7 +340,7 @@ impl MemoryLogRecordsArrowBuilder { is_closed: false, arrow_record_batch_builder: arrow_batch_builder, arrow_compression_info, - } + }) } pub fn append(&mut self, record: &WriteRecord) -> Result { @@ -302,7 +370,7 @@ impl MemoryLogRecordsArrowBuilder { self.is_closed = true; } - pub fn build(&self) -> Result> { + pub fn build(&mut self) -> Result> { // serialize arrow batch let mut arrow_batch_bytes = vec![]; let table_schema = self.arrow_record_batch_builder.schema(); @@ -641,24 +709,24 @@ fn parse_ipc_message( Ok((batch_metadata, body_buffer, message.version())) } -pub fn to_arrow_schema(fluss_schema: &RowType) -> SchemaRef { - let fields: Vec = fluss_schema +pub fn to_arrow_schema(fluss_schema: &RowType) -> Result { + let fields: Result> = fluss_schema .fields() .iter() .map(|f| { - Field::new( + Ok(Field::new( f.name(), - to_arrow_type(f.data_type()), + to_arrow_type(f.data_type())?, f.data_type().is_nullable(), - ) + )) }) .collect(); - SchemaRef::new(arrow_schema::Schema::new(fields)) + Ok(SchemaRef::new(arrow_schema::Schema::new(fields?))) } -pub fn to_arrow_type(fluss_type: &DataType) -> ArrowDataType { - match fluss_type { +pub fn to_arrow_type(fluss_type: &DataType) -> Result { + Ok(match fluss_type { DataType::Boolean(_) => ArrowDataType::Boolean, DataType::TinyInt(_) => ArrowDataType::Int8, DataType::SmallInt(_) => ArrowDataType::Int16, @@ -668,58 +736,91 @@ pub fn to_arrow_type(fluss_type: &DataType) -> ArrowDataType { DataType::Double(_) => ArrowDataType::Float64, DataType::Char(_) => ArrowDataType::Utf8, DataType::String(_) => ArrowDataType::Utf8, - DataType::Decimal(decimal_type) => ArrowDataType::Decimal128( - decimal_type - .precision() - .try_into() - .expect("precision exceeds u8::MAX"), - decimal_type + DataType::Decimal(decimal_type) => { + let precision = + 
decimal_type + .precision() + .try_into() + .map_err(|_| Error::IllegalArgument { + message: format!( + "Decimal precision {} exceeds Arrow's maximum (u8::MAX)", + decimal_type.precision() + ), + })?; + let scale = decimal_type .scale() .try_into() - .expect("scale exceeds i8::MAX"), - ), + .map_err(|_| Error::IllegalArgument { + message: format!( + "Decimal scale {} exceeds Arrow's maximum (i8::MAX)", + decimal_type.scale() + ), + })?; + ArrowDataType::Decimal128(precision, scale) + } DataType::Date(_) => ArrowDataType::Date32, DataType::Time(time_type) => match time_type.precision() { 0 => ArrowDataType::Time32(arrow_schema::TimeUnit::Second), 1..=3 => ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond), 4..=6 => ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond), 7..=9 => ArrowDataType::Time64(arrow_schema::TimeUnit::Nanosecond), - // This arm should never be reached due to validation in TimeType. - invalid => panic!("Invalid precision value for TimeType: {invalid}"), + invalid => { + return Err(Error::IllegalArgument { + message: format!("Invalid precision {} for TimeType (must be 0-9)", invalid), + }); + } }, DataType::Timestamp(timestamp_type) => match timestamp_type.precision() { 0 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, None), 1..=3 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None), 4..=6 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None), 7..=9 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None), - // This arm should never be reached due to validation in Timestamp. 
- invalid => panic!("Invalid precision value for TimestampType: {invalid}"), + invalid => { + return Err(Error::IllegalArgument { + message: format!( + "Invalid precision {} for TimestampType (must be 0-9)", + invalid + ), + }); + } }, DataType::TimestampLTz(timestamp_ltz_type) => match timestamp_ltz_type.precision() { 0 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, None), 1..=3 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None), 4..=6 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None), 7..=9 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None), - // This arm should never be reached due to validation in TimestampLTz. - invalid => panic!("Invalid precision value for TimestampLTzType: {invalid}"), + invalid => { + return Err(Error::IllegalArgument { + message: format!( + "Invalid precision {} for TimestampLTzType (must be 0-9)", + invalid + ), + }); + } }, DataType::Bytes(_) => ArrowDataType::Binary, - DataType::Binary(binary_type) => ArrowDataType::FixedSizeBinary( - binary_type + DataType::Binary(binary_type) => { + let length = binary_type .length() .try_into() - .expect("length exceeds i32::MAX"), - ), + .map_err(|_| Error::IllegalArgument { + message: format!( + "Binary length {} exceeds Arrow's maximum (i32::MAX)", + binary_type.length() + ), + })?; + ArrowDataType::FixedSizeBinary(length) + } DataType::Array(array_type) => ArrowDataType::List( Field::new_list_field( - to_arrow_type(array_type.get_element_type()), + to_arrow_type(array_type.get_element_type())?, fluss_type.is_nullable(), ) .into(), ), DataType::Map(map_type) => { - let key_type = to_arrow_type(map_type.key_type()); - let value_type = to_arrow_type(map_type.value_type()); + let key_type = to_arrow_type(map_type.key_type())?; + let value_type = to_arrow_type(map_type.value_type())?; let entry_fields = vec![ Field::new("key", key_type, map_type.key_type().is_nullable()), Field::new("value", value_type, 
map_type.value_type().is_nullable()), @@ -733,20 +834,21 @@ pub fn to_arrow_type(fluss_type: &DataType) -> ArrowDataType { false, ) } - DataType::Row(row_type) => ArrowDataType::Struct(arrow_schema::Fields::from( - row_type + DataType::Row(row_type) => { + let fields: Result> = row_type .fields() .iter() .map(|f| { - Field::new( + Ok(Field::new( f.name(), - to_arrow_type(f.data_type()), + to_arrow_type(f.data_type())?, f.data_type().is_nullable(), - ) + )) }) - .collect::>(), - )), - } + .collect(); + ArrowDataType::Struct(arrow_schema::Fields::from(fields?)) + } + }) } #[derive(Clone)] @@ -1059,81 +1161,114 @@ mod tests { #[test] fn test_to_array_type() { - assert_eq!(to_arrow_type(&DataTypes::boolean()), ArrowDataType::Boolean); - assert_eq!(to_arrow_type(&DataTypes::tinyint()), ArrowDataType::Int8); - assert_eq!(to_arrow_type(&DataTypes::smallint()), ArrowDataType::Int16); - assert_eq!(to_arrow_type(&DataTypes::bigint()), ArrowDataType::Int64); - assert_eq!(to_arrow_type(&DataTypes::int()), ArrowDataType::Int32); - assert_eq!(to_arrow_type(&DataTypes::float()), ArrowDataType::Float32); - assert_eq!(to_arrow_type(&DataTypes::double()), ArrowDataType::Float64); - assert_eq!(to_arrow_type(&DataTypes::char(16)), ArrowDataType::Utf8); - assert_eq!(to_arrow_type(&DataTypes::string()), ArrowDataType::Utf8); assert_eq!( - to_arrow_type(&DataTypes::decimal(10, 2)), + to_arrow_type(&DataTypes::boolean()).unwrap(), + ArrowDataType::Boolean + ); + assert_eq!( + to_arrow_type(&DataTypes::tinyint()).unwrap(), + ArrowDataType::Int8 + ); + assert_eq!( + to_arrow_type(&DataTypes::smallint()).unwrap(), + ArrowDataType::Int16 + ); + assert_eq!( + to_arrow_type(&DataTypes::bigint()).unwrap(), + ArrowDataType::Int64 + ); + assert_eq!( + to_arrow_type(&DataTypes::int()).unwrap(), + ArrowDataType::Int32 + ); + assert_eq!( + to_arrow_type(&DataTypes::float()).unwrap(), + ArrowDataType::Float32 + ); + assert_eq!( + to_arrow_type(&DataTypes::double()).unwrap(), + ArrowDataType::Float64 + 
); + assert_eq!( + to_arrow_type(&DataTypes::char(16)).unwrap(), + ArrowDataType::Utf8 + ); + assert_eq!( + to_arrow_type(&DataTypes::string()).unwrap(), + ArrowDataType::Utf8 + ); + assert_eq!( + to_arrow_type(&DataTypes::decimal(10, 2)).unwrap(), ArrowDataType::Decimal128(10, 2) ); - assert_eq!(to_arrow_type(&DataTypes::date()), ArrowDataType::Date32); assert_eq!( - to_arrow_type(&DataTypes::time()), + to_arrow_type(&DataTypes::date()).unwrap(), + ArrowDataType::Date32 + ); + assert_eq!( + to_arrow_type(&DataTypes::time()).unwrap(), ArrowDataType::Time32(arrow_schema::TimeUnit::Second) ); assert_eq!( - to_arrow_type(&DataTypes::time_with_precision(3)), + to_arrow_type(&DataTypes::time_with_precision(3)).unwrap(), ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond) ); assert_eq!( - to_arrow_type(&DataTypes::time_with_precision(6)), + to_arrow_type(&DataTypes::time_with_precision(6)).unwrap(), ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond) ); assert_eq!( - to_arrow_type(&DataTypes::time_with_precision(9)), + to_arrow_type(&DataTypes::time_with_precision(9)).unwrap(), ArrowDataType::Time64(arrow_schema::TimeUnit::Nanosecond) ); assert_eq!( - to_arrow_type(&DataTypes::timestamp_with_precision(0)), + to_arrow_type(&DataTypes::timestamp_with_precision(0)).unwrap(), ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, None) ); assert_eq!( - to_arrow_type(&DataTypes::timestamp_with_precision(3)), + to_arrow_type(&DataTypes::timestamp_with_precision(3)).unwrap(), ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None) ); assert_eq!( - to_arrow_type(&DataTypes::timestamp_with_precision(6)), + to_arrow_type(&DataTypes::timestamp_with_precision(6)).unwrap(), ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) ); assert_eq!( - to_arrow_type(&DataTypes::timestamp_with_precision(9)), + to_arrow_type(&DataTypes::timestamp_with_precision(9)).unwrap(), ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None) ); 
assert_eq!( - to_arrow_type(&DataTypes::timestamp_ltz_with_precision(0)), + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(0)).unwrap(), ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, None) ); assert_eq!( - to_arrow_type(&DataTypes::timestamp_ltz_with_precision(3)), + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(3)).unwrap(), ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None) ); assert_eq!( - to_arrow_type(&DataTypes::timestamp_ltz_with_precision(6)), + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(6)).unwrap(), ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) ); assert_eq!( - to_arrow_type(&DataTypes::timestamp_ltz_with_precision(9)), + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(9)).unwrap(), ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None) ); - assert_eq!(to_arrow_type(&DataTypes::bytes()), ArrowDataType::Binary); assert_eq!( - to_arrow_type(&DataTypes::binary(16)), + to_arrow_type(&DataTypes::bytes()).unwrap(), + ArrowDataType::Binary + ); + assert_eq!( + to_arrow_type(&DataTypes::binary(16)).unwrap(), ArrowDataType::FixedSizeBinary(16) ); assert_eq!( - to_arrow_type(&DataTypes::array(DataTypes::int())), + to_arrow_type(&DataTypes::array(DataTypes::int())).unwrap(), ArrowDataType::List(Field::new_list_field(ArrowDataType::Int32, true).into()) ); assert_eq!( - to_arrow_type(&DataTypes::map(DataTypes::string(), DataTypes::int())), + to_arrow_type(&DataTypes::map(DataTypes::string(), DataTypes::int())).unwrap(), ArrowDataType::Map( Arc::new(Field::new( "entries", @@ -1151,7 +1286,8 @@ mod tests { to_arrow_type(&DataTypes::row(vec![ DataTypes::field("f1".to_string(), DataTypes::int()), DataTypes::field("f2".to_string(), DataTypes::string()), - ])), + ])) + .unwrap(), ArrowDataType::Struct(arrow_schema::Fields::from(vec![ Field::new("f1", ArrowDataType::Int32, true), Field::new("f2", ArrowDataType::Utf8, true), @@ -1215,7 +1351,7 @@ mod tests { 
DataField::new("id".to_string(), DataTypes::int(), None), DataField::new("name".to_string(), DataTypes::string(), None), ]); - let schema = to_arrow_schema(&row_type); + let schema = to_arrow_schema(&row_type).unwrap(); let result = ReadContext::with_projection_pushdown(schema, vec![0, 2], false); assert!(matches!(result, Err(IllegalArgument { .. }))); @@ -1249,4 +1385,209 @@ mod tests { } out } + + #[test] + fn test_temporal_and_decimal_builder_validation() { + use arrow::array::Array; + + // Test valid builder creation with precision=10, scale=2 + let mut builder = + RowAppendRecordBatchBuilder::create_builder(&ArrowDataType::Decimal128(10, 2)).unwrap(); + let decimal_builder = builder + .as_any_mut() + .downcast_mut::() + .expect("Expected Decimal128Builder"); + // Verify precision and scale + let array = decimal_builder.finish(); + assert_eq!(array.data_type(), &ArrowDataType::Decimal128(10, 2)); + + // Test error case: invalid precision/scale + let result = + RowAppendRecordBatchBuilder::create_builder(&ArrowDataType::Decimal128(100, 50)); + assert!(result.is_err()); + } + + #[test] + fn test_decimal_rescaling_and_validation() -> Result<()> { + use crate::row::{Datum, Decimal, GenericRow}; + use arrow::array::Decimal128Array; + use bigdecimal::BigDecimal; + use std::str::FromStr; + + // Test 1: Rescaling from scale 3 to scale 2 + let row_type = RowType::new(vec![DataField::new( + "amount".to_string(), + DataTypes::decimal(10, 2), + None, + )]); + let mut builder = RowAppendRecordBatchBuilder::new(&row_type)?; + let decimal = Decimal::from_big_decimal(BigDecimal::from_str("123.456").unwrap(), 10, 3)?; + builder.append(&GenericRow { + values: vec![Datum::Decimal(decimal)], + })?; + let batch = builder.build_arrow_record_batch()?; + let array = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(array.value(0), 12346); // 123.456 rounded to 2 decimal places + assert_eq!(array.scale(), 2); + + // Test 2: Precision overflow (should error) + 
let row_type = RowType::new(vec![DataField::new( + "amount".to_string(), + DataTypes::decimal(5, 2), + None, + )]); + let mut builder = RowAppendRecordBatchBuilder::new(&row_type)?; + let decimal = Decimal::from_big_decimal(BigDecimal::from_str("123456.78").unwrap(), 10, 2)?; + let result = builder.append(&GenericRow { + values: vec![Datum::Decimal(decimal)], + }); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("precision overflow") + ); + + Ok(()) + } + + #[test] + fn test_all_types_end_to_end() -> Result<()> { + use crate::row::{Date, Datum, Decimal, GenericRow, Time, TimestampLtz, TimestampNtz}; + use arrow::array::{ + Date32Array, Decimal128Array, Int32Array, Time32MillisecondArray, + Time64NanosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, + }; + use bigdecimal::BigDecimal; + use std::str::FromStr; + + // Schema with int, decimal, date, time (ms + ns), timestamps (μs + ns) + let row_type = RowType::new(vec![ + DataField::new("id".to_string(), DataTypes::int(), None), + DataField::new("amount".to_string(), DataTypes::decimal(10, 2), None), + DataField::new("date".to_string(), DataTypes::date(), None), + DataField::new( + "time_ms".to_string(), + DataTypes::time_with_precision(3), + None, + ), + DataField::new( + "time_ns".to_string(), + DataTypes::time_with_precision(9), + None, + ), + DataField::new( + "ts_us".to_string(), + DataTypes::timestamp_with_precision(6), + None, + ), + DataField::new( + "ts_ltz_ns".to_string(), + DataTypes::timestamp_ltz_with_precision(9), + None, + ), + ]); + + let mut builder = RowAppendRecordBatchBuilder::new(&row_type)?; + + // Append rows with various data types + builder.append(&GenericRow { + values: vec![ + Datum::Int32(1), + Datum::Decimal(Decimal::from_big_decimal( + BigDecimal::from_str("123.456").unwrap(), + 10, + 3, + )?), + // 18000 days since epoch = 2019-04-14 + Datum::Date(Date::new(18000)), + // 43200000 ms = 12:00:00.000 (noon) + 
Datum::Time(Time::new(43200000)), + // 12345 ms = 00:00:12.345 + Datum::Time(Time::new(12345)), + // 1609459200000 ms = 2021-01-01 00:00:00 UTC, with 123456 additional nanoseconds + Datum::TimestampNtz(TimestampNtz::from_millis_nanos(1609459200000, 123456)?), + // 1609459200000 ms = 2021-01-01 00:00:00 UTC, with 987654 additional nanoseconds + Datum::TimestampLtz(TimestampLtz::from_millis_nanos(1609459200000, 987654)?), + ], + })?; + + let batch = builder.build_arrow_record_batch()?; + + // Verify all conversions + assert_eq!( + batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 1 + ); + + let dec = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(dec.value(0), 12346); // 123.456 rounded to 2 decimal places + + assert_eq!( + batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 18000 + ); + + assert_eq!( + batch + .column(3) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 43200000 + ); + + assert_eq!( + batch + .column(4) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 12345000000 + ); + + // Timestamp with sub-millisecond nanos preserved + assert_eq!( + batch + .column(5) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 1609459200000123 + ); + + assert_eq!( + batch + .column(6) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 1609459200000987654 + ); + + Ok(()) + } } diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs index 5b21b3899b..7b3850f880 100644 --- a/fluss-rust/crates/fluss/src/row/datum.rs +++ b/fluss-rust/crates/fluss/src/row/datum.rs @@ -19,9 +19,13 @@ use crate::error::Error::RowConvertError; use crate::error::Result; use crate::row::Decimal; use arrow::array::{ - ArrayBuilder, BinaryBuilder, BooleanBuilder, Float32Builder, Float64Builder, Int8Builder, - Int16Builder, Int32Builder, Int64Builder, StringBuilder, + ArrayBuilder, BinaryBuilder, BooleanBuilder, Date32Builder, 
Decimal128Builder, Float32Builder, + Float64Builder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, StringBuilder, + Time32MillisecondBuilder, Time32SecondBuilder, Time64MicrosecondBuilder, + Time64NanosecondBuilder, TimestampMicrosecondBuilder, TimestampMillisecondBuilder, + TimestampNanosecondBuilder, TimestampSecondBuilder, }; +use arrow::datatypes as arrow_schema; use jiff::ToSpan; use ordered_float::OrderedFloat; use parse_display::Display; @@ -83,6 +87,41 @@ impl Datum<'_> { _ => panic!("not a blob: {self:?}"), } } + + pub fn as_decimal(&self) -> &Decimal { + match self { + Self::Decimal(d) => d, + _ => panic!("not a decimal: {self:?}"), + } + } + + pub fn as_date(&self) -> Date { + match self { + Self::Date(d) => *d, + _ => panic!("not a date: {self:?}"), + } + } + + pub fn as_time(&self) -> Time { + match self { + Self::Time(t) => *t, + _ => panic!("not a time: {self:?}"), + } + } + + pub fn as_timestamp_ntz(&self) -> TimestampNtz { + match self { + Self::TimestampNtz(ts) => *ts, + _ => panic!("not a timestamp ntz: {self:?}"), + } + } + + pub fn as_timestamp_ltz(&self) -> TimestampLtz { + match self { + Self::TimestampLtz(ts) => *ts, + _ => panic!("not a timestamp ltz: {self:?}"), + } + } } // ----------- implement from @@ -246,6 +285,66 @@ impl TryFrom<&Datum<'_>> for i8 { } } +impl TryFrom<&Datum<'_>> for Decimal { + type Error = (); + + #[inline] + fn try_from(from: &Datum) -> std::result::Result { + match from { + Datum::Decimal(d) => Ok(d.clone()), + _ => Err(()), + } + } +} + +impl TryFrom<&Datum<'_>> for Date { + type Error = (); + + #[inline] + fn try_from(from: &Datum) -> std::result::Result { + match from { + Datum::Date(d) => Ok(*d), + _ => Err(()), + } + } +} + +impl TryFrom<&Datum<'_>> for Time { + type Error = (); + + #[inline] + fn try_from(from: &Datum) -> std::result::Result { + match from { + Datum::Time(t) => Ok(*t), + _ => Err(()), + } + } +} + +impl TryFrom<&Datum<'_>> for TimestampNtz { + type Error = (); + + #[inline] + fn 
try_from(from: &Datum) -> std::result::Result { + match from { + Datum::TimestampNtz(ts) => Ok(*ts), + _ => Err(()), + } + } +} + +impl TryFrom<&Datum<'_>> for TimestampLtz { + type Error = (); + + #[inline] + fn try_from(from: &Datum) -> std::result::Result { + match from { + Datum::TimestampLtz(ts) => Ok(*ts), + _ => Err(()), + } + } +} + impl<'a> From for Datum<'a> { #[inline] fn from(b: bool) -> Datum<'a> { @@ -253,12 +352,103 @@ impl<'a> From for Datum<'a> { } } +impl<'a> From for Datum<'a> { + #[inline] + fn from(d: Decimal) -> Datum<'a> { + Datum::Decimal(d) + } +} + +impl<'a> From for Datum<'a> { + #[inline] + fn from(d: Date) -> Datum<'a> { + Datum::Date(d) + } +} + +impl<'a> From