Skip to content

Row counting for feature files is too slow #11

@h0wl34

Description

@h0wl34

Counting the rows of all feature files is highly inefficient because it loads them into memory first. Here is a quick fix I made that reads each file through a fixed-size buffer instead:

def _fast_count_rows(file_paths: list[Path]) -> int:
    """Count the rows (lines) across all given files without decoding them.

    Each file is read in fixed-size binary chunks and the newline bytes in
    every chunk are counted, so memory use is bounded by the chunk size
    rather than by the file size. A non-empty final line that lacks a
    trailing newline is still counted as a row, which matches the count
    obtained by iterating over the file line by line.

    Args:
        file_paths: Paths of the files to scan.

    Returns:
        The total number of rows summed over every file in ``file_paths``.
    """
    buf_size = 1024 * 1024  # 1 MiB per read
    total_lines = 0
    pbar = tqdm.tqdm(file_paths, desc="Counting rows")
    for file_path in pbar:
        pbar.set_postfix_str(file_path.name)
        lines = 0
        last_byte = b""
        # buffering=0 returns the raw file object, so every read() goes
        # straight to the OS without an intermediate buffer copy.
        with file_path.open("rb", buffering=0) as f:
            while buf := f.read(buf_size):
                lines += buf.count(b"\n")
                last_byte = buf[-1:]
        # A file whose last line has no terminating newline still holds
        # one more row than the number of newline bytes seen.
        if last_byte and last_byte != b"\n":
            lines += 1
        total_lines += lines
    return total_lines

and in model.create_vectorized_features():

# ...
print("Preparing to vectorize raw features")
X_train_path = data_path / "X_train.dat"
y_train_path = data_path / "y_train.dat"
train_feature_paths = gather_feature_paths(data_path, "train")
# Previous approach, kept for reference: iterate over every line of every file.
# train_nrows = sum([1 for fp in train_feature_paths for _ in fp.open()])
# Replacement: count rows with the chunked newline counter instead.
train_nrows = _fast_count_rows(train_feature_paths)

X_test_path = data_path / "X_test.dat"
y_test_path = data_path / "y_test.dat"
test_feature_paths = gather_feature_paths(data_path, "test")
# Previous approach, kept for reference: iterate over every line of every file.
# test_nrows = sum([1 for fp in test_feature_paths for _ in fp.open()])
# Replacement: count rows with the chunked newline counter instead.
test_nrows = _fast_count_rows(test_feature_paths)
# ...

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions