-
Notifications
You must be signed in to change notification settings - Fork 23
[ISSUE-153] Add blocking poll into python bindings #154
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
fresh-borzoni
wants to merge
2
commits into
apache:main
Choose a base branch
from
fresh-borzoni:pythonic-poll
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,10 +20,13 @@ use crate::*; | |
| use arrow::array::RecordBatch; | ||
| use arrow_pyarrow::{FromPyArrow, ToPyArrow}; | ||
| use fluss::client::EARLIEST_OFFSET; | ||
| use fluss::record::to_arrow_schema; | ||
| use fluss::rpc::message::OffsetSpec; | ||
| use pyo3::types::IntoPyDict; | ||
| use pyo3_async_runtimes::tokio::future_into_py; | ||
| use std::collections::HashMap; | ||
| use std::sync::Arc; | ||
| use std::time::Duration; | ||
|
|
||
| // Time conversion constants | ||
| const MILLIS_PER_SECOND: i64 = 1_000; | ||
|
|
@@ -186,7 +189,7 @@ impl FlussTable { | |
| } | ||
|
|
||
| let rust_scanner = table_scan | ||
| .create_log_scanner() | ||
| .create_record_batch_log_scanner() | ||
| .map_err(|e| FlussError::new_err(format!("Failed to create log scanner: {e}")))?; | ||
|
|
||
| let admin = conn | ||
|
|
@@ -888,7 +891,7 @@ fn get_type_name(value: &Bound<PyAny>) -> String { | |
| /// Scanner for reading log data from a Fluss table | ||
| #[pyclass] | ||
| pub struct LogScanner { | ||
| inner: fcore::client::LogScanner, | ||
| inner: fcore::client::RecordBatchLogScanner, | ||
| admin: fcore::client::FlussAdmin, | ||
| table_info: fcore::metadata::TableInfo, | ||
| #[allow(dead_code)] | ||
|
|
@@ -933,9 +936,6 @@ impl LogScanner { | |
|
|
||
| /// Convert all data to Arrow Table | ||
| fn to_arrow(&self, py: Python) -> PyResult<Py<PyAny>> { | ||
| use std::collections::HashMap; | ||
| use std::time::Duration; | ||
|
|
||
| let mut all_batches = Vec::new(); | ||
|
|
||
| let num_buckets = self.table_info.get_num_buckets(); | ||
|
|
@@ -960,31 +960,56 @@ impl LogScanner { | |
| .block_on(async { self.inner.poll(Duration::from_millis(500)).await }); | ||
|
|
||
| match batch_result { | ||
| Ok(scan_records) => { | ||
| let mut result_records: Vec<fcore::record::ScanRecord> = vec![]; | ||
| for (bucket, records) in scan_records.into_records_by_buckets() { | ||
| let stopping_offset = stopping_offsets.get(&bucket.bucket_id()); | ||
|
|
||
| if stopping_offset.is_none() { | ||
| // not to include this bucket, skip records for this bucket | ||
| // since we already reach end offset for this bucket | ||
| continue; | ||
| } | ||
| if let Some(last_record) = records.last() { | ||
| let offset = last_record.offset(); | ||
| result_records.extend(records); | ||
| if offset >= stopping_offset.unwrap() - 1 { | ||
| stopping_offsets.remove(&bucket.bucket_id()); | ||
| Ok(scan_batches) => { | ||
| for scan_batch in scan_batches { | ||
| let bucket_id = scan_batch.bucket().bucket_id(); | ||
|
|
||
| // Extract stopping_offset once to avoid double unwrap | ||
| let stop_exclusive = match stopping_offsets.get(&bucket_id) { | ||
| Some(&offset) => offset, | ||
| None => { | ||
| // we already reached end offset for this bucket | ||
| continue; | ||
| } | ||
| } | ||
| } | ||
| }; | ||
|
|
||
| // Compute the inclusive last offset we want (stop_exclusive - 1) | ||
| let stop_inclusive = match stop_exclusive.checked_sub(1) { | ||
| Some(v) => v, | ||
| None => { | ||
| // stop_exclusive was 0 or negative - nothing to read | ||
| stopping_offsets.remove(&bucket_id); | ||
| continue; | ||
| } | ||
| }; | ||
|
|
||
| let base_offset = scan_batch.base_offset(); | ||
| let last_offset = scan_batch.last_offset(); | ||
|
|
||
| // Check if we need to slice this batch to avoid overshoot | ||
| let batch = if last_offset > stop_inclusive { | ||
| // This batch extends past our stopping point | ||
| // Slice to only include records up to stop_inclusive | ||
| let records_to_keep = (stop_inclusive - base_offset + 1) as usize; | ||
|
|
||
| let full_batch = scan_batch.into_batch(); | ||
| let actual_rows = full_batch.num_rows(); | ||
|
|
||
| full_batch.slice(0, records_to_keep.min(actual_rows)) | ||
| } else { | ||
| // This batch is entirely before our stopping point | ||
| scan_batch.into_batch() | ||
| }; | ||
|
|
||
| if !result_records.is_empty() { | ||
| let arrow_batch = Utils::convert_scan_records_to_arrow(result_records); | ||
| all_batches.extend(arrow_batch); | ||
| all_batches.push(Arc::new(batch)); | ||
|
|
||
| // Remove this bucket if we've reached or passed the stopping offset | ||
| if last_offset >= stop_inclusive { | ||
| stopping_offsets.remove(&bucket_id); | ||
| } | ||
| } | ||
|
|
||
| // we have reach end offsets of all bucket | ||
| // we have reached end offsets of all buckets | ||
| if stopping_offsets.is_empty() { | ||
| break; | ||
| } | ||
|
|
@@ -1006,15 +1031,68 @@ impl LogScanner { | |
| Ok(df) | ||
| } | ||
|
|
||
| /// Poll for new records with the specified timeout | ||
| /// | ||
| /// Args: | ||
| /// timeout_ms: Timeout in milliseconds to wait for records | ||
| /// | ||
| /// Returns: | ||
| /// PyArrow Table containing the polled records | ||
| /// | ||
| /// Note: | ||
| /// - Returns an empty table (with correct schema) if no records are available | ||
| /// - When timeout expires, returns an empty table (NOT an error) | ||
| fn poll(&self, py: Python, timeout_ms: i64) -> PyResult<Py<PyAny>> { | ||
| if timeout_ms < 0 { | ||
| return Err(FlussError::new_err(format!( | ||
| "timeout_ms must be non-negative, got: {timeout_ms}" | ||
| ))); | ||
| } | ||
|
|
||
| let timeout = Duration::from_millis(timeout_ms as u64); | ||
| let scan_batches = py | ||
| .detach(|| TOKIO_RUNTIME.block_on(async { self.inner.poll(timeout).await })) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. actually, I'm thinking can we use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. makes sense, thank you! |
||
| .map_err(|e| FlussError::new_err(e.to_string()))?; | ||
|
|
||
| // Convert ScanBatch to Arrow batches | ||
| if scan_batches.is_empty() { | ||
| return self.create_empty_table(py); | ||
| } | ||
|
|
||
| let arrow_batches: Vec<_> = scan_batches | ||
| .into_iter() | ||
| .map(|scan_batch| Arc::new(scan_batch.into_batch())) | ||
| .collect(); | ||
|
|
||
| Utils::combine_batches_to_table(py, arrow_batches) | ||
| } | ||
|
|
||
| /// Create an empty PyArrow table with the correct schema | ||
| fn create_empty_table(&self, py: Python) -> PyResult<Py<PyAny>> { | ||
| let arrow_schema = to_arrow_schema(self.table_info.get_row_type()) | ||
| .map_err(|e| FlussError::new_err(format!("Failed to get arrow schema: {e}")))?; | ||
| let py_schema = arrow_schema | ||
| .as_ref() | ||
| .to_pyarrow(py) | ||
| .map_err(|e| FlussError::new_err(format!("Failed to convert schema: {e}")))?; | ||
|
|
||
| let pyarrow = py.import("pyarrow")?; | ||
| let empty_table = pyarrow | ||
| .getattr("Table")? | ||
| .call_method1("from_batches", (vec![] as Vec<Py<PyAny>>, py_schema))?; | ||
|
|
||
| Ok(empty_table.into()) | ||
| } | ||
|
|
||
| fn __repr__(&self) -> String { | ||
| format!("LogScanner(table={})", self.table_info.table_path) | ||
| } | ||
| } | ||
|
|
||
| impl LogScanner { | ||
| /// Create LogScanner from core LogScanner | ||
| /// Create LogScanner from core RecordBatchLogScanner | ||
| pub fn from_core( | ||
| inner_scanner: fcore::client::LogScanner, | ||
| inner_scanner: fcore::client::RecordBatchLogScanner, | ||
| admin: fcore::client::FlussAdmin, | ||
| table_info: fcore::metadata::TableInfo, | ||
| ) -> Self { | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.