diff --git a/.gitignore b/.gitignore
index 3b63de3..31f2232 100644
--- a/.gitignore
+++ b/.gitignore
@@ -174,4 +174,6 @@ fastTextAttention.py
 poetry.lock
 
 # vscode
-.vscode/
\ No newline at end of file
+.vscode/
+
+benchmark_results/
diff --git a/README.md b/README.md
index 73810a9..95fa297 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,18 @@
 # torchTextClassifiers
 
-A unified, extensible framework for text classification built on [PyTorch](https://pytorch.org/) and [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/).
+A unified, extensible framework for text classification with categorical variables built on [PyTorch](https://pytorch.org/) and [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/).
 
 ## 🚀 Features
 
-- **Unified API**: Consistent interface for different classifier wrappers
-- **Extensible**: Easy to add new classifier implementations through wrapper pattern
-- **FastText Support**: Built-in FastText classifier with n-gram tokenization
-- **Flexible Preprocessing**: Each classifier can implement its own text preprocessing approach
+- **Mixed input support**: Handle text data alongside categorical variables seamlessly.
+- **Unified yet highly customizable**:
+  - Use any tokenizer from HuggingFace or fastText's original n-gram tokenizer.
+  - Manipulate the components (`TextEmbedder`, `CategoricalVariableNet`, `ClassificationHead`) to easily create custom architectures, including **self-attention**. All of them are `torch.nn.Module`s!
+  - The `TextClassificationModel` class combines these components and can be extended for custom behavior.
 - **PyTorch Lightning**: Automated training with callbacks, early stopping, and logging
+- **Easy experimentation**: Simple API for training, evaluating, and predicting with minimal code:
+  - The `torchTextClassifiers` wrapper class orchestrates the tokenizer and the model for you
+- **Additional features**: prediction explainability using Captum
 
 ## 📦 Installation
 
@@ -25,129 +29,9 @@ uv sync
 
 pip install -e .
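+# Alternatively, install the development version straight from GitHub
+# (the same command the example notebook uses):
+# uv add git+https://github.com/InseeFrLab/torchTextClassifiers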
 ```
-## 🎯 Quick Start
-
-### Basic FastText Classification
-
-```python
-import numpy as np
-from torchTextClassifiers import create_fasttext
-
-# Create a FastText classifier
-classifier = create_fasttext(
-    embedding_dim=100,
-    sparse=False,
-    num_tokens=10000,
-    min_count=2,
-    min_n=3,
-    max_n=6,
-    len_word_ngrams=2,
-    num_classes=2
-)
-
-# Prepare your data
-X_train = np.array([
-    "This is a positive example",
-    "This is a negative example",
-    "Another positive case",
-    "Another negative case"
-])
-y_train = np.array([1, 0, 1, 0])
-
-X_val = np.array([
-    "Validation positive",
-    "Validation negative"
-])
-y_val = np.array([1, 0])
-
-# Build the model
-classifier.build(X_train, y_train)
-
-# Train the model
-classifier.train(
-    X_train, y_train, X_val, y_val,
-    num_epochs=50,
-    batch_size=32,
-    patience_train=5,
-    verbose=True
-)
-
-# Make predictions
-X_test = np.array(["This is a test sentence"])
-predictions = classifier.predict(X_test)
-print(f"Predictions: {predictions}")
-
-# Validate on test set
-accuracy = classifier.validate(X_test, np.array([1]))
-print(f"Accuracy: {accuracy:.3f}")
-```
-
-### Custom Classifier Implementation
-
-```python
-import numpy as np
-from torchTextClassifiers import torchTextClassifiers
-from torchTextClassifiers.classifiers.simple_text_classifier import SimpleTextWrapper, SimpleTextConfig
-
-# Example: TF-IDF based classifier (alternative to tokenization)
-config = SimpleTextConfig(
-    hidden_dim=128,
-    num_classes=2,
-    max_features=5000,
-    learning_rate=1e-3,
-    dropout_rate=0.2
-)
-
-# Create classifier with TF-IDF preprocessing
-wrapper = SimpleTextWrapper(config)
-classifier = torchTextClassifiers(wrapper)
-
-# Text data
-X_train = np.array(["Great product!", "Terrible service", "Love it!"])
-y_train = np.array([1, 0, 1])
-
-# Build and train
-classifier.build(X_train, y_train)
-# ... continue with training
-```
-
-
-### Training Customization
-
-```python
-# Custom PyTorch Lightning trainer parameters
-trainer_params = {
-    'accelerator': 'gpu',
-    'devices': 1,
-    'precision': 16,  # Mixed precision training
-    'gradient_clip_val': 1.0,
-}
-
-classifier.train(
-    X_train, y_train, X_val, y_val,
-    num_epochs=100,
-    batch_size=64,
-    patience_train=10,
-    trainer_params=trainer_params,
-    verbose=True
-)
-```
-
-## 🔬 Testing
-
-Run the test suite:
-
-```bash
-# Run all tests
-uv run pytest
-
-# Run with coverage
-uv run pytest --cov=torchTextClassifiers
-
-# Run specific test file
-uv run pytest tests/test_torchTextClassifiers.py -v
-```
+## 📝 Usage
+Check out the [notebook](notebooks/example.ipynb) for a quick start.
 
 ## 📚 Examples
 
@@ -155,10 +39,8 @@ See the [examples/](examples/) directory for:
 - Basic text classification
 - Multi-class classification
 - Mixed features (text + categorical)
-- Custom classifier implementation
 - Advanced training configurations
-
-
+- Prediction and explainability
 
 ## 📄 License
diff --git a/examples/advanced_training.py b/examples/advanced_training.py
index e03f075..a1a21bb 100644
--- a/examples/advanced_training.py
+++ b/examples/advanced_training.py
@@ -6,13 +6,44 @@ and training monitoring.
""" +import os +import random +import warnings + import numpy as np -from torchTextClassifiers import create_fasttext +import torch +from pytorch_lightning import seed_everything + +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.tokenizers import WordPieceTokenizer def main(): + # Set seed for reproducibility + SEED = 42 + + # Set environment variables for full reproducibility + os.environ['PYTHONHASHSEED'] = str(SEED) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + + # Use PyTorch Lightning's seed_everything for comprehensive seeding + seed_everything(SEED, workers=True) + + # Make PyTorch operations deterministic + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.use_deterministic_algorithms(True, warn_only=True) + + # Suppress PyTorch Lightning warnings for cleaner output + warnings.filterwarnings( + 'ignore', + message='.*', + category=UserWarning, + module='pytorch_lightning' + ) + print("⚙️ Advanced Training Configuration Example") print("=" * 50) - + # Create a larger dataset for demonstrating advanced training print("📝 Creating training dataset...") @@ -67,54 +98,63 @@ def main(): print(f"Training samples: {len(X_train)}") print(f"Validation samples: {len(X_val)}") print(f"Test samples: {len(X_test)}") - - # Create FastText classifier - print("\n🏗️ Creating FastText classifier...") - classifier = create_fasttext( + + # Create and train tokenizer (shared across all examples) + print("\n🏗️ Creating and training WordPiece tokenizer...") + tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=128) + training_corpus = X_train.tolist() + tokenizer.train(training_corpus) + print("✅ Tokenizer trained successfully!") + + # Example 1: Basic training with default settings + print("\n🎯 Example 1: Basic training with default settings...") + + model_config = ModelConfig( embedding_dim=100, - sparse=False, - num_tokens=10000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, num_classes=2 ) - - # Build the model - print("\n🔨 Building model...") - classifier.build(X_train, y_train) - print("✅ Model built successfully!") - - # Example 1: Basic training with default settings - print("\n🎯 Example 1: Basic training with default settings...") - classifier.train( - X_train, y_train, X_val, y_val, + + classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config + ) + print("✅ Classifier created successfully!") + + training_config = TrainingConfig( num_epochs=15, batch_size=8, - patience_train=5, + lr=1e-3, + patience_early_stopping=5, + num_workers=0, + trainer_params={'deterministic': True} + ) + + classifier.train( + X_train, y_train, X_val, y_val, + training_config=training_config, verbose=True ) - - basic_accuracy = classifier.validate(X_test, y_test) + + result = classifier.predict(X_test) + basic_predictions = result["prediction"].squeeze().numpy() + basic_accuracy = (basic_predictions == y_test).mean() print(f"✅ Basic training completed! 
Accuracy: {basic_accuracy:.3f}") # Example 2: Advanced training with custom Lightning trainer parameters print("\n🚀 Example 2: Advanced training with custom parameters...") - + # Create a new classifier for comparison - advanced_classifier = create_fasttext( + advanced_model_config = ModelConfig( embedding_dim=100, - sparse=False, - num_tokens=10000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, num_classes=2 ) - advanced_classifier.build(X_train, y_train) - + + advanced_classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=advanced_model_config + ) + print("✅ Advanced classifier created successfully!") + # Custom trainer parameters for advanced features advanced_trainer_params = { 'accelerator': 'auto', # Use GPU if available, else CPU @@ -125,62 +165,77 @@ def main(): 'enable_progress_bar': True, # Show progress bar 'log_every_n_steps': 5, # Log every 5 steps } - - advanced_classifier.train( - X_train, y_train, X_val, y_val, + + advanced_training_config = TrainingConfig( num_epochs=20, batch_size=4, # Smaller batch size with grad accumulation - patience_train=7, - trainer_params=advanced_trainer_params, + lr=1e-3, + patience_early_stopping=7, + num_workers=0, + cpu_run=False, # Don't override accelerator from trainer_params + trainer_params=advanced_trainer_params + ) + + advanced_classifier.train( + X_train, y_train, X_val, y_val, + training_config=advanced_training_config, verbose=True ) - - advanced_accuracy = advanced_classifier.validate(X_test, y_test) + + advanced_result = advanced_classifier.predict(X_test) + advanced_predictions = advanced_result["prediction"].squeeze().numpy() + advanced_accuracy = (advanced_predictions == y_test).mean() print(f"✅ Advanced training completed! Accuracy: {advanced_accuracy:.3f}") # Example 3: Training with CPU-only (useful for small datasets or debugging) print("\n💻 Example 3: CPU-only training...") - - cpu_classifier = create_fasttext( + + cpu_model_config = ModelConfig( embedding_dim=64, # Smaller embedding for faster CPU training - sparse=True, # Sparse embeddings for efficiency - num_tokens=5000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, num_classes=2 ) - cpu_classifier.build(X_train, y_train) - - cpu_classifier.train( - X_train, y_train, X_val, y_val, + + cpu_classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=cpu_model_config + ) + print("✅ CPU classifier created successfully!") + + cpu_training_config = TrainingConfig( num_epochs=10, batch_size=16, # Larger batch size for CPU - cpu_run=True, # Force CPU usage + lr=1e-3, + patience_early_stopping=3, + cpu_run=False, # Don't override accelerator from trainer_params num_workers=0, # No multiprocessing for CPU - patience_train=3, + trainer_params={'deterministic': True, 'accelerator': 'cpu'} + ) + + cpu_classifier.train( + X_train, y_train, X_val, y_val, + training_config=cpu_training_config, verbose=True ) - - cpu_accuracy = cpu_classifier.validate(X_test, y_test) + + cpu_result = cpu_classifier.predict(X_test) + cpu_predictions = cpu_result["prediction"].squeeze().numpy() + cpu_accuracy = (cpu_predictions == y_test).mean() print(f"✅ CPU training completed! 
Accuracy: {cpu_accuracy:.3f}")
 
     # Example 4: Custom training with specific Lightning callbacks
     print("\n🔧 Example 4: Training with custom callbacks...")
-
-    custom_classifier = create_fasttext(
+
+    custom_model_config = ModelConfig(
         embedding_dim=128,
-        sparse=False,
-        num_tokens=8000,
-        min_count=1,
-        min_n=3,
-        max_n=6,
-        len_word_ngrams=2,
         num_classes=2
     )
-    custom_classifier.build(X_train, y_train)
-
+
+    custom_classifier = torchTextClassifiers(
+        tokenizer=tokenizer,
+        model_config=custom_model_config
+    )
+    print("✅ Custom classifier created successfully!")
+
     # Custom trainer with specific monitoring and checkpointing
     custom_trainer_params = {
         'max_epochs': 25,
@@ -189,18 +244,27 @@ def main():
         'check_val_every_n_epoch': 2,  # Validate every 2 epochs
         'enable_checkpointing': True,
         'enable_model_summary': True,
+        'deterministic': True,
     }
-
-    custom_classifier.train(
-        X_train, y_train, X_val, y_val,
+
+    custom_training_config = TrainingConfig(
         num_epochs=25,
         batch_size=6,
-        patience_train=8,
-        trainer_params=custom_trainer_params,
+        lr=1e-3,
+        patience_early_stopping=8,
+        num_workers=0,
+        trainer_params=custom_trainer_params
+    )
+
+    custom_classifier.train(
+        X_train, y_train, X_val, y_val,
+        training_config=custom_training_config,
         verbose=True
     )
-
-    custom_accuracy = custom_classifier.validate(X_test, y_test)
+
+    custom_result = custom_classifier.predict(X_test)
+    custom_predictions = custom_result["prediction"].squeeze().numpy()
+    custom_accuracy = (custom_predictions == y_test).mean()
     print(f"✅ Custom training completed! Accuracy: {custom_accuracy:.3f}")
 
     # Compare all training approaches
diff --git a/examples/basic_classification.py b/examples/basic_classification.py
index da0e5bc..2d1de9d 100644
--- a/examples/basic_classification.py
+++ b/examples/basic_classification.py
@@ -2,16 +2,31 @@ Basic Text Classification Example
 
 This example demonstrates how to use torchTextClassifiers for binary
-text classification using the FastText classifier.
+text classification using the torchTextClassifiers wrapper.
 """
 
+import os
+import random
+import warnings
+
 import numpy as np
-from torchTextClassifiers import create_fasttext
+import torch
+
+from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers
+from torchTextClassifiers.tokenizers import WordPieceTokenizer
+
 
 def main():
+    # Suppress PyTorch Lightning warnings (e.g. batch_size inference) for cleaner output
+    warnings.filterwarnings(
+        'ignore',
+        message='.*',
+        category=UserWarning,
+        module='pytorch_lightning'
+    )
+
     print("🚀 Basic Text Classification Example")
     print("=" * 50)
-
+
     # Create sample data
     print("📝 Creating sample data...")
     X_train = np.array([
@@ -24,67 +39,114 @@ def main():
         "Perfect! Exactly what I was looking for.",
         "Waste of money. Should have read reviews first.",
         "Outstanding product with excellent build quality.",
-        "Cheap plastic, feels like it will break soon."
+        "Cheap plastic, feels like it will break soon.",
+        "Absolutely fantastic! Exceeded all my expectations.",
+        "Horrible experience. Customer service was rude and unhelpful.",
+        "Best purchase I've made this year. Five stars!",
+        "Defective item arrived. Packaging was also damaged.",
+        "Super impressed with the performance and durability.",
+        "Total disappointment. Doesn't match the description at all.",
+        "Wonderful product! My whole family loves it.",
+        "Avoid at all costs. Complete waste of time and money.",
+        "Remarkable quality for the price. Very satisfied!",
+        "Broke within a week. Clearly poor manufacturing.",
+        "Exceptional value!
Would definitely buy again.", + "Misleading photos. Product looks nothing like advertised.", + "Works like a charm. Installation was easy too.", + "Returned it immediately. Not worth even half the price.", + "Beautiful design and sturdy construction. Love it!", + "Arrived late and damaged. Very frustrating experience.", + "Top-notch quality! Highly recommend to everyone.", + "Uncomfortable and poorly made. Regret buying this.", + "Perfect fit and great finish. Couldn't be happier!", + "Stopped working after two uses. Complete junk." ]) - - y_train = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) # 1=positive, 0=negative + + y_train = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) # 1=positive, 0=negative # Validation data X_val = np.array([ "Good product, satisfied with purchase.", - "Not worth the money, poor quality." + "Not worth the money, poor quality.", + "Really happy with this purchase. Great item!", + "Disappointed with the quality. Expected better.", + "Solid product that does what it promises.", + "Don't waste your money on this. Very poor.", + "Impressive quality and quick delivery.", + "Malfunctioned right out of the box. Terrible." ]) - y_val = np.array([1, 0]) + y_val = np.array([1, 0, 1, 0, 1, 0, 1, 0]) # Test data X_test = np.array([ "This is an amazing product with great features!", "Completely disappointed with this purchase.", - "Excellent build quality and works as expected." + "Excellent build quality and works as expected.", + "Not recommended. Had issues from day one.", + "Fantastic product! Worth every penny.", + "Failed to meet basic expectations. Very poor.", + "Love it! Exactly as described and high quality.", + "Cheap materials and sloppy construction. Avoid.", + "Superb performance and easy to use. Highly satisfied!", + "Unreliable and frustrating. Should have bought elsewhere." 
]) - y_test = np.array([1, 0, 1]) + y_test = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) print(f"Training samples: {len(X_train)}") print(f"Validation samples: {len(X_val)}") print(f"Test samples: {len(X_test)}") - # Create FastText classifier - print("\n🏗️ Creating FastText classifier...") - classifier = create_fasttext( + # Create and train tokenizer + print("\n🏗️ Creating and training WordPiece tokenizer...") + tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=128) + + # Train tokenizer on the training corpus + training_corpus = X_train.tolist() + tokenizer.train(training_corpus) + print("✅ Tokenizer trained successfully!") + + # Create model configuration + print("\n🔧 Creating model configuration...") + model_config = ModelConfig( embedding_dim=50, - sparse=False, - num_tokens=5000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, num_classes=2 ) - - # Build the model - print("\n🔨 Building model...") - classifier.build(X_train, y_train) - print("✅ Model built successfully!") - + + # Create classifier + print("\n🔨 Creating classifier...") + classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config + ) + print("✅ Classifier created successfully!") + print(classifier) # Train the model print("\n🎯 Training model...") - classifier.train( - X_train, y_train, X_val, y_val, + training_config = TrainingConfig( num_epochs=20, batch_size=4, - patience_train=5, + lr=1e-3, + patience_early_stopping=5, + num_workers=0, # Use 0 for simple examples to avoid multiprocessing issues + ) + classifier.train( + X_train, y_train, X_val, y_val, + training_config=training_config, verbose=True ) print("✅ Training completed!") # Make predictions print("\n🔮 Making predictions...") - predictions = classifier.predict(X_test) + result = classifier.predict(X_test) + predictions = result["prediction"].squeeze().numpy() # Extract predictions from dictionary + confidence = result["confidence"].squeeze().numpy() # Extract confidence scores print(f"Predictions: {predictions}") + print(f"Confidence: {confidence}") print(f"True labels: {y_test}") - + # Calculate accuracy - accuracy = classifier.validate(X_test, y_test) + accuracy = (predictions == y_test).mean() print(f"Test accuracy: {accuracy:.3f}") # Show detailed results @@ -99,5 +161,6 @@ def main(): print("\n🎉 Example completed successfully!") + if __name__ == "__main__": main() \ No newline at end of file diff --git a/examples/multiclass_classification.py b/examples/multiclass_classification.py index e8863b6..8498e17 100644 --- a/examples/multiclass_classification.py +++ b/examples/multiclass_classification.py @@ -2,17 +2,48 @@ Multi-class Text Classification Example This example demonstrates multi-class text classification using -torchTextClassifiers with FastText for sentiment analysis with +torchTextClassifiers for sentiment analysis with 3 classes: positive, negative, and neutral. 
""" +import os +import random +import warnings + import numpy as np -from torchTextClassifiers import create_fasttext +import torch +from pytorch_lightning import seed_everything + +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.tokenizers import WordPieceTokenizer def main(): + # Set seed for reproducibility + SEED = 42 + + # Set environment variables for full reproducibility + os.environ['PYTHONHASHSEED'] = str(SEED) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + + # Use PyTorch Lightning's seed_everything for comprehensive seeding + seed_everything(SEED, workers=True) + + # Make PyTorch operations deterministic + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.use_deterministic_algorithms(True, warn_only=True) + + # Suppress PyTorch Lightning warnings for cleaner output + warnings.filterwarnings( + 'ignore', + message='.*', + category=UserWarning, + module='pytorch_lightning' + ) + print("🎭 Multi-class Text Classification Example") print("=" * 50) - + # Create multi-class sample data (3 classes: 0=negative, 1=neutral, 2=positive) print("📝 Creating multi-class sentiment data...") X_train = np.array([ @@ -63,44 +94,55 @@ def main(): print(f"Training samples: {len(X_train)}") print(f"Class distribution: Negative={sum(y_train==0)}, Neutral={sum(y_train==1)}, Positive={sum(y_train==2)}") - - # Create FastText classifier for 3 classes - print("\n🏗️ Creating multi-class FastText classifier...") - classifier = create_fasttext( + + # Create and train tokenizer + print("\n🏗️ Creating and training WordPiece tokenizer...") + tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=128) + training_corpus = X_train.tolist() + tokenizer.train(training_corpus) + print("✅ Tokenizer trained successfully!") + + # Create model configuration for 3 classes + print("\n🔧 Creating model configuration...") + model_config = ModelConfig( embedding_dim=64, - sparse=False, - num_tokens=8000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, - num_classes=3 # 3 classes for sentiment + num_classes=3 # 3 classes for sentiment (negative, neutral, positive) ) - - # Build the model - print("\n🔨 Building model...") - classifier.build(X_train, y_train) - print("✅ Model built successfully!") - + + # Create classifier + print("\n🔨 Creating multi-class classifier...") + classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config + ) + print("✅ Classifier created successfully!") + # Train the model print("\n🎯 Training model...") - classifier.train( - X_train, y_train, X_val, y_val, + training_config = TrainingConfig( num_epochs=30, batch_size=8, - patience_train=7, + lr=1e-3, + patience_early_stopping=7, + num_workers=0, + trainer_params={'deterministic': True} + ) + classifier.train( + X_train, y_train, X_val, y_val, + training_config=training_config, verbose=True ) print("✅ Training completed!") # Make predictions print("\n🔮 Making predictions...") - predictions = classifier.predict(X_test) + result = classifier.predict(X_test) + predictions = result["prediction"].squeeze().numpy() print(f"Predictions: {predictions}") print(f"True labels: {y_test}") - + # Calculate accuracy - accuracy = classifier.validate(X_test, y_test) + accuracy = (predictions == y_test).mean() print(f"Test accuracy: {accuracy:.3f}") # Define class names for better output diff --git a/examples/simple_explainability_example.py b/examples/simple_explainability_example.py index c1c44c2..57bcadd 100644 --- 
a/examples/simple_explainability_example.py +++ b/examples/simple_explainability_example.py @@ -2,14 +2,48 @@ Simple Explainability Example with ASCII Visualization """ -import numpy as np +import os import sys -from torchTextClassifiers import create_fasttext +import warnings + +import numpy as np +import torch +from pytorch_lightning import seed_everything + +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.tokenizers import WordPieceTokenizer +from torchTextClassifiers.utilities.plot_explainability import ( + map_attributions_to_char, + map_attributions_to_word, +) def main(): + # Set seed for reproducibility + SEED = 42 + + # Set environment variables for full reproducibility + os.environ['PYTHONHASHSEED'] = str(SEED) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + + # Use PyTorch Lightning's seed_everything for comprehensive seeding + seed_everything(SEED, workers=True) + + # Make PyTorch operations deterministic + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.use_deterministic_algorithms(True, warn_only=True) + + # Suppress PyTorch Lightning warnings for cleaner output + warnings.filterwarnings( + 'ignore', + message='.*', + category=UserWarning, + module='pytorch_lightning' + ) + print("🔍 Simple Explainability Example") - + # Enhanced training data with more diverse examples X_train = np.array([ # Positive examples @@ -55,29 +89,51 @@ def main(): ]) X_val = np.array([ - "Good product with decent quality", + "Good product with decent quality", "Bad quality and poor service", "Excellent value and great design", "Terrible experience and awful quality" ]) y_val = np.array([1, 0, 1, 0]) - - # Create classifier - classifier = create_fasttext( + + # Create and train tokenizer + print("\n🏗️ Creating and training WordPiece tokenizer...") + tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=128) + training_corpus = X_train.tolist() + tokenizer.train(training_corpus) + print("✅ Tokenizer trained successfully!") + + # Create model configuration + print("\n🔧 Creating model configuration...") + model_config = ModelConfig( embedding_dim=50, - sparse=False, - num_tokens=1000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, - num_classes=2, - direct_bagging=False # Required for explainability + num_classes=2 ) - - # Train - classifier.build(X_train, y_train) - classifier.train(X_train, y_train, X_val, y_val, num_epochs=25, batch_size=8, verbose=False) + + # Create classifier + print("\n🔨 Creating classifier...") + classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config + ) + print("✅ Classifier created successfully!") + + # Train the model + print("\n🎯 Training model...") + training_config = TrainingConfig( + num_epochs=25, + batch_size=8, + lr=1e-3, + patience_early_stopping=5, + num_workers=0, + trainer_params={'deterministic': True} + ) + classifier.train( + X_train, y_train, X_val, y_val, + training_config=training_config, + verbose=True + ) + print("✅ Training completed!") # Test examples with different sentiments test_texts = [ @@ -94,49 +150,70 @@ def main(): for i, test_text in enumerate(test_texts, 1): print(f"\n📝 Example {i}:") print(f"Text: '{test_text}'") - - # Get prediction - prediction = classifier.predict(np.array([test_text]))[0] - print(f"Prediction: {'Positive' if prediction == 1 else 'Negative'}") - - # Get explainability scores + + # Get prediction with explainability try: - pred, confidence, all_scores, 
all_scores_letters = classifier.predict_and_explain(np.array([test_text])) - - # Create ASCII histogram - if all_scores is not None and len(all_scores) > 0: - scores_data = all_scores[0][0] - if hasattr(scores_data, 'tolist'): - scores = scores_data.tolist() - else: - scores = [float(scores_data)] - - words = test_text.split() - - if len(words) == len(scores): - print("\n📊 Word Contribution Histogram:") - print("-" * 50) - - # Find max score for scaling - max_score = max(scores) if scores else 1 - bar_width = 30 # max bar width in characters - - for word, score in zip(words, scores): - # Calculate bar length - bar_length = int((score / max_score) * bar_width) - bar = "█" * bar_length - - # Format output - print(f"{word:>12} | {bar:<30} {score:.4f}") - - print("-" * 50) - else: - print(f"⚠️ Word/score mismatch: {len(words)} words vs {len(scores)} scores") - else: - print("⚠️ No explainability scores available") - + result = classifier.predict(np.array([test_text]), top_k=1, explain=True) + + # Extract prediction + prediction = result["prediction"][0][0].item() + confidence = result["confidence"][0][0].item() + print(f"Prediction: {'Positive' if prediction == 1 else 'Negative'} (confidence: {confidence:.4f})") + + # Extract attributions and mapping info + attributions = result["attributions"][0][0] # shape: (seq_len,) + offset_mapping = result["offset_mapping"][0] # List of (start, end) tuples + word_ids = result["word_ids"][0] # List of word IDs for each token + + # Map token-level attributions to character-level (for ASCII visualization) + char_attributions = map_attributions_to_char( + attributions.unsqueeze(0), # Add batch dimension: (1, seq_len) + offset_mapping, + test_text + )[0] # Get first result + + print("\n📊 Character-Level Contribution Visualization:") + print("-" * 60) + + # Create a simple ASCII visualization by character + max_attr = max(char_attributions) if len(char_attributions) > 0 else 1 + bar_width = 40 + + # Group characters into words for better readability + words = test_text.split() + char_idx = 0 + + for word in words: + word_len = len(word) + # Get attributions for this word + word_attrs = char_attributions[char_idx:char_idx + word_len] + if len(word_attrs) > 0: + avg_attr = sum(word_attrs) / len(word_attrs) + bar_length = int((avg_attr / max_attr) * bar_width) if max_attr > 0 else 0 + bar = "█" * bar_length + print(f"{word:>15} | {bar:<40} {avg_attr:.4f}") + char_idx += word_len + 1 # +1 for space + + print("-" * 60) + + # Show top contributing word + char_idx = 0 + word_scores = [] + for word in words: + word_len = len(word) + word_attrs = char_attributions[char_idx:char_idx + word_len] + if len(word_attrs) > 0: + word_scores.append((word, sum(word_attrs) / len(word_attrs))) + char_idx += word_len + 1 + + if word_scores: + top_word, top_score = max(word_scores, key=lambda x: x[1]) + print(f"💡 Most influential word: '{top_word}' (avg score: {top_score:.4f})") + except Exception as e: print(f"⚠️ Explainability failed: {e}") + import traceback + traceback.print_exc() # Analysis completed for this example print(f"✅ Analysis completed for example {i}") @@ -164,56 +241,72 @@ def main(): continue print(f"\n🔍 Analyzing: '{user_text}'") - - # Get prediction - prediction = classifier.predict(np.array([user_text]))[0] - sentiment = "Positive" if prediction == 1 else "Negative" - print(f"🎯 Prediction: {sentiment}") - - # Get explainability scores + + # Get prediction with explainability try: - pred, confidence, all_scores, all_scores_letters = 
classifier.predict_and_explain(np.array([user_text])) - - # Create ASCII histogram - if all_scores is not None and len(all_scores) > 0: - scores_data = all_scores[0][0] - if hasattr(scores_data, 'tolist'): - scores = scores_data.tolist() - else: - scores = [float(scores_data)] - - words = user_text.split() - - if len(words) == len(scores): - print("\n📊 Word Contribution Histogram:") - print("-" * 50) - - # Find max score for scaling - max_score = max(scores) if scores else 1 - bar_width = 30 # max bar width in characters - - for word, score in zip(words, scores): - # Calculate bar length - bar_length = int((score / max_score) * bar_width) - bar = "█" * bar_length - - # Format output - print(f"{word:>12} | {bar:<30} {score:.4f}") - - print("-" * 50) - - # Show interpretation - top_word = max(zip(words, scores), key=lambda x: x[1]) - print(f"💡 Most influential word: '{top_word[0]}' (score: {top_word[1]:.4f})") - - else: - print(f"⚠️ Word/score mismatch: {len(words)} words vs {len(scores)} scores") - else: - print("⚠️ No explainability scores available") - + result = classifier.predict(np.array([user_text]), top_k=1, explain=True) + + # Extract prediction + prediction = result["prediction"][0][0].item() + confidence = result["confidence"][0][0].item() + sentiment = "Positive" if prediction == 1 else "Negative" + print(f"🎯 Prediction: {sentiment} (confidence: {confidence:.4f})") + + # Extract attributions and mapping info + attributions = result["attributions"][0][0] # shape: (seq_len,) + offset_mapping = result["offset_mapping"][0] # List of (start, end) tuples + word_ids = result["word_ids"][0] # List of word IDs for each token + + # Map token-level attributions to character-level (for ASCII visualization) + char_attributions = map_attributions_to_char( + attributions.unsqueeze(0), # Add batch dimension: (1, seq_len) + offset_mapping, + user_text + )[0] # Get first result + + print("\n📊 Character-Level Contribution Visualization:") + print("-" * 60) + + # Create a simple ASCII visualization by character + max_attr = max(char_attributions) if len(char_attributions) > 0 else 1 + bar_width = 40 + + # Group characters into words for better readability + words = user_text.split() + char_idx = 0 + + for word in words: + word_len = len(word) + # Get attributions for this word + word_attrs = char_attributions[char_idx:char_idx + word_len] + if len(word_attrs) > 0: + avg_attr = sum(word_attrs) / len(word_attrs) + bar_length = int((avg_attr / max_attr) * bar_width) if max_attr > 0 else 0 + bar = "█" * bar_length + print(f"{word:>15} | {bar:<40} {avg_attr:.4f}") + char_idx += word_len + 1 # +1 for space + + print("-" * 60) + + # Show interpretation + char_idx = 0 + word_scores = [] + for word in words: + word_len = len(word) + word_attrs = char_attributions[char_idx:char_idx + word_len] + if len(word_attrs) > 0: + word_scores.append((word, sum(word_attrs) / len(word_attrs))) + char_idx += word_len + 1 + + if word_scores: + top_word, top_score = max(word_scores, key=lambda x: x[1]) + print(f"💡 Most influential word: '{top_word}' (avg score: {top_score:.4f})") + except Exception as e: print(f"⚠️ Explainability failed: {e}") print("🔍 Prediction available, but detailed explanation unavailable.") + import traceback + traceback.print_exc() print("\n" + "-"*50) diff --git a/examples/using_additional_features.py b/examples/using_additional_features.py index e22796c..854c0e4 100644 --- a/examples/using_additional_features.py +++ b/examples/using_additional_features.py @@ -2,19 +2,26 @@ Categorical Features 
Comparison Example This example demonstrates the performance difference between: -1. A FastText classifier using only text features -2. A FastText classifier using both text and categorical features +1. A classifier using only text features +2. A classifier using both text and categorical features """ +import os +import random +import time +import warnings + import numpy as np import pandas as pd +import torch +from pytorch_lightning import seed_everything from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder -from torchTextClassifiers import create_fasttext -from torchTextClassifiers.utilities.preprocess import clean_text_feature -from torchTextClassifiers import torchTextClassifiers -from torchTextClassifiers.classifiers.simple_text_classifier import SimpleTextWrapper, SimpleTextConfig -import time + +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.tokenizers import WordPieceTokenizer +# Note: SimpleTextWrapper is not available in the current version +# from torchTextClassifiers.classifiers.simple_text_classifier import SimpleTextConfig, SimpleTextWrapper def stratified_split_rare_labels(X, y, test_size=0.2, min_train_samples=1): # Get unique labels and their frequencies @@ -95,131 +102,102 @@ def train_and_evaluate_model(X, y, model_name, use_categorical=False, use_simple X_temp, y_temp, test_size=0.5 # Split temp 50/50 into validation and test ) + # Note: SimpleTextWrapper is not available in the current version + # The use_simple branch has been disabled if use_simple: - start_time = time.time() - - simple_text_config = SimpleTextConfig( - hidden_dim=128, - num_classes=5, - max_features=5000, - learning_rate=1e-3, - dropout_rate=0.2 - ) - wrapper = SimpleTextWrapper(simple_text_config) - classifier = torchTextClassifiers(wrapper) - print(f"Classifier type: {type(classifier.classifier).__name__}") - print(f"Uses tokenizer: {hasattr(classifier.classifier, 'tokenizer')}") - print(f"Uses vectorizer: {hasattr(classifier.classifier, 'vectorizer')}") - - # Build the model (this will use TF-IDF vectorization instead of tokenization) - print("\n🔨 Building model with TF-IDF preprocessing...") - classifier.build(X_train, y_train) - print("✅ Model built successfully!") - print(f"TF-IDF features: {len(classifier.classifier.vectorizer.get_feature_names_out())}") - - # Train the model - print("\n🎯 Training model...") - classifier.train( - X_train, y_train, X_val, y_val, - num_epochs=10, - batch_size=4, - patience_train=3, - verbose=True + raise NotImplementedError( + "SimpleTextWrapper is not available in the current version. " + "Please use the standard torchTextClassifiers with WordPieceTokenizer instead." 
) - training_time = time.time() - start_time - accuracy = classifier.validate(X_test, y_test) - print(f"Test accuracy: {accuracy:.3f}") - - return { - 'model_name': model_name, - 'test_accuracy': accuracy, - 'training_time': training_time, - 'classifier': classifier - } - # Model parameters + # Create and train tokenizer + print(" 🏗️ Creating and training tokenizer...") + tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=128) + + # Extract text column for tokenizer training + if use_categorical: + text_data = X_train[:, 0].tolist() # First column is text + else: + text_data = X_train.tolist() # All data is text + + tokenizer.train(text_data) + print(" ✅ Tokenizer trained successfully!") + + # Model configuration if use_categorical: # For mixed model - get vocabulary sizes from data - # cat_data = X_train[:, 1:].astype(int) # Categorical features - # vocab_sizes = [int(np.max(cat_data[:, i]) + 1) for i in range(cat_data.shape[1])] - - model_params = { - "embedding_dim": 50, - "sparse": False, - "num_tokens": 50000, - "min_count": 1, - "min_n": 3, - "max_n": 6, - "len_word_ngrams": 2, - "categorical_embedding_dims": 10, - #"num_categorical_features": num_cat_var, - #"categorical_vocabulary_sizes": vocab_sizes, - #"categorical_embedding_dims": 10 - } - #print(f" Categorical vocabulary sizes: {vocab_sizes}") + cat_data = X_train[:, 1:].astype(int) # Categorical features + vocab_sizes = [int(np.max(cat_data[:, i]) + 1) for i in range(cat_data.shape[1])] + + model_config = ModelConfig( + embedding_dim=50, + categorical_vocabulary_sizes=vocab_sizes, + categorical_embedding_dims=10, + num_classes=5 + ) + print(f" Categorical vocabulary sizes: {vocab_sizes}") else: # For text-only model - model_params = { - "embedding_dim": 50, - "sparse": False, - - "num_tokens": 50000, - "min_count": 1, - "min_n": 3, - "max_n": 6, - "len_word_ngrams": 2 + model_config = ModelConfig( + embedding_dim=50, + num_classes=5 + ) + + # Create classifier + print(" 🔨 Creating classifier...") + classifier = torchTextClassifiers( + tokenizer=tokenizer, + model_config=model_config + ) + print(" ✅ Classifier created successfully!") + + # Training configuration + training_config = TrainingConfig( + num_epochs=50, + batch_size=128, + lr=0.001, + patience_early_stopping=3, + num_workers=0, + trainer_params={ + 'enable_progress_bar': True, + 'deterministic': True } - - # Training parameters - reduced to save disk space - train_params = { - "num_epochs": 50, - "batch_size": 128, - "patience_train": 3, - "lr": 0.001, - "verbose": True - } + ) - extra_trainer_params = { - "enable_progress_bar": True, - - } - # Create and build model start_time = time.time() - - classifier = create_fasttext(**model_params) - classifier.build(X_train, y_train) - - # Train model - disable logging to save disk space + + # Train model + print(" 🎯 Training model...") classifier.train( - X_train, y_train, X_val, y_val, **train_params, - trainer_params=extra_trainer_params + X_train, y_train, X_val, y_val, + training_config=training_config, + verbose=True ) training_time = time.time() - start_time - + # Handle predictions based on model type if use_categorical: - # Skip validation for mixed model due to categorical prediction bug print(" ✅ Running validation for text-with-categorical-variables model...") try: - test_accuracy = classifier.validate(X_test, y_test) - predictions = classifier.predict(X_test) + result = classifier.predict(X_test) + predictions = result["prediction"].squeeze().numpy() + test_accuracy = (predictions == y_test).mean() 
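+            # predict() returns a dict of tensors ("prediction", "confidence"),
+            # replacing the old classifier.validate() helper, so accuracy is
+            # computed manually from the prediction tensor.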
print(f" Test accuracy: {test_accuracy:.3f}") except Exception as e: print(f" ⚠️ Validation failed: {e}") - train_accuracy = 0.0 test_accuracy = 0.0 predictions = np.zeros(len(y_test)) else: # Text-only model works fine for predictions print(" ✅ Running validation for text-only model...") try: - test_accuracy = classifier.validate(X_test, y_test) - predictions = classifier.predict(X_test) + result = classifier.predict(X_test) + predictions = result["prediction"].squeeze().numpy() + test_accuracy = (predictions == y_test).mean() print(f" Test accuracy: {test_accuracy:.3f}") except Exception as e: print(f" ⚠️ Validation failed: {e}") - train_accuracy = 0.0 test_accuracy = 0.0 predictions = np.zeros(len(y_test)) @@ -236,11 +214,34 @@ def train_and_evaluate_model(X, y, model_name, use_categorical=False, use_simple def main(): - print("🔀 FastText Classifier: Categorical Features Comparison") + # Set seed for reproducibility + SEED = 42 + + # Set environment variables for full reproducibility + os.environ['PYTHONHASHSEED'] = str(SEED) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + + # Use PyTorch Lightning's seed_everything for comprehensive seeding + seed_everything(SEED, workers=True) + + # Make PyTorch operations deterministic + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.use_deterministic_algorithms(True, warn_only=True) + + # Suppress PyTorch Lightning warnings for cleaner output + warnings.filterwarnings( + 'ignore', + message='.*', + category=UserWarning, + module='pytorch_lightning' + ) + + print("🔀 Classifier: Categorical Features Comparison") print("=" * 60) - print("Comparing FastText performance with and without categorical features") + print("Comparing performance with and without categorical features") print() - + # Load and prepare data (same as notebook) X_text_only, X_mixed, y, encoder = load_and_prepare_data() @@ -250,17 +251,17 @@ def main(): # Text-only model results_text_only = train_and_evaluate_model( - X_text_only, y, "Text-Only FastText", use_categorical=False + X_text_only, y, "Text-Only Classifier", use_categorical=False ) - + # Mixed model (text + categorical) results_mixed = train_and_evaluate_model( - X_mixed, y, "Mixed Features FastText", use_categorical=True + X_mixed, y, "Mixed Features Classifier", use_categorical=True ) - # TF-IDF classifier - results_tfidf = train_and_evaluate_model(X_text_only, y, "TF-IDF classifier", use_categorical=False, use_simple=True) - + # Note: TF-IDF classifier (SimpleTextWrapper) is not available in the current version + # results_tfidf = train_and_evaluate_model(X_text_only, y, "TF-IDF classifier", use_categorical=False, use_simple=True) + # Compare results print(f"\n📊 Results Comparison:") print("=" * 50) @@ -270,8 +271,6 @@ def main(): f"{results_text_only['test_accuracy']:<11.3f} {results_text_only['training_time']:<10.1f}") print(f"{'Mixed Features':<25} " f"{results_mixed['test_accuracy']:<11.3f} {results_mixed['training_time']:<10.1f}") - print(f"{'TF-IDF':<25} " - f"{results_tfidf['test_accuracy']:<11.3f} {results_tfidf['training_time']:<10.1f}") # Calculate improvements acc_improvement = results_mixed['test_accuracy'] - results_text_only['test_accuracy'] time_overhead = results_mixed['training_time'] - results_text_only['training_time'] diff --git a/notebooks/example.ipynb b/notebooks/example.ipynb index 02c41b1..cd32c28 100644 --- a/notebooks/example.ipynb +++ b/notebooks/example.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": 
"a01b1526-51df-4bf9-9fd4-11ef22ffcc79", + "id": "0", "metadata": {}, "source": [ "# Example usage of the `torchTextClassifiers` library\n", @@ -13,488 +13,53 @@ "regular look to for\n", "latest information.*\n", "\n", - "To install package, you can run the following snippet" + "To download the latest (development) version of the library, you can use\n", + "```bash\n", + "uv add git+https://github.com/InseeFrLab/torchTextClassifiers\n", + "```\n", + "or, if you prefer using `pip`:\n", + "```bash\n", + "pip install git git+https://github.com/InseeFrLab/torchTextClassifiers\n", + "```\n" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "a00a2856", + "execution_count": null, + "id": "1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2mUsing Python 3.13.5 environment at: /opt/python\u001b[0m\n", - "\u001b[2K\u001b[2mResolved \u001b[1m42 packages\u001b[0m \u001b[2min 70ms\u001b[0m\u001b[0m \u001b[0m\n", - "\u001b[2K \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m torchtextclassifiers\u001b[2m @ file:///home/onyxia/work/torchTextClassifiers\u001b[0m\n", - "\u001b[2K\u001b[1A \u001b[32m\u001b[1mBuilt\u001b[0m\u001b[39m torchtextclassifiers\u001b[2m @ file:///home/onyxia/work/torchTextClassifiers\u001b[0m\n", - "\u001b[2K\u001b[2mPrepared \u001b[1m1 package\u001b[0m \u001b[2min 28ms\u001b[0m\u001b[0m \n", - "\u001b[2mUninstalled \u001b[1m1 package\u001b[0m \u001b[2min 1ms\u001b[0m\u001b[0m\n", - "\u001b[2K░░░░░░░░░░░░░░░░░░░░ [0/1] \u001b[2mInstalling wheels... \u001b[0m\u001b[1m\u001b[33mwarning\u001b[39m\u001b[0m\u001b[1m:\u001b[0m \u001b[1mFailed to hardlink files; falling back to full copy. This may lead to degraded performance.\n", - " If the cache and target directories are on different filesystems, hardlinking may not be supported.\n", - " If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning.\u001b[0m\n", - "\u001b[2K\u001b[2mInstalled \u001b[1m1 package\u001b[0m \u001b[2min 3ms\u001b[0m\u001b[0ms==0.0.0.dev0 (from file:///home/o\u001b[0m\n", - " \u001b[33m~\u001b[39m \u001b[1mtorchtextclassifiers\u001b[0m\u001b[2m==0.0.0.dev0 (from file:///home/onyxia/work/torchTextClassifiers)\u001b[0m\n", - "Note: you may need to restart the kernel to use updated packages.\n", - "\u001b[2mUsing Python 3.13.5 environment at: /opt/python\u001b[0m\n", - "\u001b[2K\u001b[2mResolved \u001b[1m43 packages\u001b[0m \u001b[2min 229ms\u001b[0m\u001b[0m \u001b[0m\n", - "\u001b[2K\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/2) \n", - "\u001b[2K\u001b[1A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/2)--------------\u001b[0m\u001b[0m 0 B/1.34 MiB \u001b[1A\n", - "\u001b[2K\u001b[1A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/2)--------------\u001b[0m\u001b[0m 14.91 KiB/1.34 MiB \u001b[1A\n", - "\u001b[2K\u001b[1A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/2)--------------\u001b[0m\u001b[0m 30.91 KiB/1.34 MiB \u001b[1A\n", - "\u001b[2K\u001b[1A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/2)--------------\u001b[0m\u001b[0m 46.91 KiB/1.34 MiB \u001b[1A\n", - "\u001b[2K\u001b[1A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/2)--------------\u001b[0m\u001b[0m 62.91 KiB/1.34 MiB \u001b[1A\n", - "\u001b[2K\u001b[1A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/2)--------------\u001b[0m\u001b[0m 78.91 KiB/1.34 MiB \u001b[1A\n", - "\u001b[2K\u001b[1A\u001b[37m⠙\u001b[0m 
numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A 
\u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - 
"\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m 
\u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m 
numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A 
\u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - 
"\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m 
\u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m 
numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[1A \u001b[32m\u001b[1mBuilt\u001b[0m\u001b[39m numpy\u001b[2m==1.26.4\u001b[0m \u001b[1A\n", - "\u001b[2K\u001b[2mPrepared \u001b[1m2 packages\u001b[0m \u001b[2min 1m 24s\u001b[0m\u001b[0m \n", - "\u001b[2mUninstalled \u001b[1m1 package\u001b[0m \u001b[2min 59ms\u001b[0m\u001b[0m\n", - "\u001b[2K░░░░░░░░░░░░░░░░░░░░ [0/3] \u001b[2mInstalling wheels... 
-    "  If the cache and target directories are on different filesystems, hardlinking may not be supported.\n",
-    "  If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning.\u001b[0m\n",
-    "\u001b[2K\u001b[2mInstalled \u001b[1m3 packages\u001b[0m \u001b[2min 6.98s\u001b[0m\u001b[0m \u001b[0m\n",
-    " \u001b[32m+\u001b[39m \u001b[1mcaptum\u001b[0m\u001b[2m==0.8.0\u001b[0m\n",
-    " \u001b[31m-\u001b[39m \u001b[1mnumpy\u001b[0m\u001b[2m==2.3.1\u001b[0m\n",
-    " \u001b[32m+\u001b[39m \u001b[1mnumpy\u001b[0m\u001b[2m==1.26.4\u001b[0m\n",
-    " \u001b[32m+\u001b[39m \u001b[1munidecode\u001b[0m\u001b[2m==1.4.0\u001b[0m\n",
-    "Note: you may need to restart the kernel to use updated packages.\n"
-    ]
-   }
-  ],
+   "outputs": [],
   "source": [
-    "# Stable version\n",
-    "%uv pip install --system .. \n",
-    "%uv pip install --system captum unidecode nltk\n"
+    "import torch\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "\n",
+    "from notebooks.utils import categorize_surface, clean_and_tokenize_df\n",
+    "from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers\n",
+    "from torchTextClassifiers.dataset import TextClassificationDataset\n",
+    "from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule\n",
+    "from torchTextClassifiers.model.components import (\n",
+    "    AttentionConfig,\n",
+    "    CategoricalVariableNet,\n",
+    "    ClassificationHead,\n",
+    "    TextEmbedder,\n",
+    "    TextEmbedderConfig,\n",
+    ")\n",
+    "from torchTextClassifiers.tokenizers import HuggingFaceTokenizer, WordPieceTokenizer\n",
+    "from torchTextClassifiers.utilities.plot_explainability import (\n",
+    "    map_attributions_to_char,\n",
+    "    map_attributions_to_word,\n",
+    "    plot_attributions_at_char,\n",
+    "    plot_attributions_at_word,\n",
+    ")\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "b292ea76-57a1-4d4e-9bde-dcc9656dc447",
+   "id": "2",
   "metadata": {},
   "source": [
    "# Load and preprocess data\n",
@@ -505,243 +70,52 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
-   "id": "37c042fe",
+   "execution_count": null,
+   "id": "3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
-    "df = pd.read_parquet(\"https://minio.lab.sspcloud.fr/projet-ape/extractions/20241027_sirene4.parquet\")\n",
-    "df = df.sample(10000)"
+    "\n",
+    "df = pd.read_parquet(\"https://minio.lab.sspcloud.fr/projet-ape/data/08112022_27102024/naf2008/split/df_train.parquet\")\n",
+    "df"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "c399b4b0-a9cb-450e-9a5e-480e0e657b8e",
+   "id": "4",
   "metadata": {},
   "source": [
-    "Our goal will be to build multilabel classification for the `code`\n",
-    "variable using `libelle` as feature.\n",
+    "It contains an extract from the French business register (SIRENE). The `apet_finale` column contains the activity code - our target - in the French version of the [NACE nomenclature](https://ec.europa.eu/eurostat/web/products-manuals-and-guidelines/-/ks-ra-07-015). The text to classify is the `libelle` column, which contains a short description of the activity.\n",
    "\n",
-    "## Enriching our test dataset\n",
-    "\n",
-    "Unlike `Fasttext`, this package offers the possibility of having several\n",
-    "feature columns of different types (string for the text column and\n",
-    "additional variables in numeric form, for example). To illustrate that,\n",
-    "we propose the following enrichment of the example dataset:"
+    "Other columns are additional, **categorical** features that will also be used as inputs to the model.\n"
   ]
  },
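+  {
+   "cell_type": "markdown",
+   "id": "4b",
+   "metadata": {},
+   "source": [
+    "As a quick, purely illustrative sanity check of the target described above (the `4b`/`4c` cell ids are placeholders added for this example):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative check: how many distinct NACE codes does the target contain,\n",
+    "# and which ones are most frequent? (standard pandas API)\n",
+    "print(df[\"apet_finale\"].nunique())\n",
+    "df[\"apet_finale\"].value_counts().head(10)"
+   ]
+  },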
  {
   "cell_type": "code",
-   "execution_count": 2,
-   "id": "92402df7",
+   "execution_count": null,
+   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "from sklearn.preprocessing import LabelEncoder\n",
-    "\n",
-    "def categorize_surface(\n",
-    "    df: pd.DataFrame, surface_feature_name: int, like_sirene_3: bool = True\n",
-    ") -> pd.DataFrame:\n",
-    "    \"\"\"\n",
-    "    Categorize the surface of the activity.\n",
-    "\n",
-    "    Args:\n",
-    "        df (pd.DataFrame): DataFrame to categorize.\n",
-    "        surface_feature_name (str): Name of the surface feature.\n",
-    "        like_sirene_3 (bool): If True, categorize like Sirene 3.\n",
-    "\n",
-    "    Returns:\n",
-    "        pd.DataFrame: DataFrame with a new column \"surf_cat\".\n",
-    "    \"\"\"\n",
-    "    df_copy = df.copy()\n",
-    "    df_copy[surface_feature_name] = df_copy[surface_feature_name].replace(\"nan\", np.nan)\n",
-    "    df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(float)\n",
-    "    # Check surface feature exists\n",
-    "    if surface_feature_name not in df.columns:\n",
-    "        raise ValueError(f\"Surface feature {surface_feature_name} not found in DataFrame.\")\n",
-    "    # Check surface feature is a float variable\n",
-    "    if not (pd.api.types.is_float_dtype(df_copy[surface_feature_name])):\n",
-    "        raise ValueError(f\"Surface feature {surface_feature_name} must be a float variable.\")\n",
-    "\n",
-    "    if like_sirene_3:\n",
-    "        # Categorize the surface\n",
-    "        df_copy[\"surf_cat\"] = pd.cut(\n",
-    "            df_copy[surface_feature_name],\n",
-    "            bins=[0, 120, 400, 2500, np.inf],\n",
-    "            labels=[\"1\", \"2\", \"3\", \"4\"],\n",
-    "        ).astype(str)\n",
-    "    else:\n",
-    "        # Log transform the surface\n",
-    "        df_copy[\"surf_log\"] = np.log(df[surface_feature_name])\n",
-    "\n",
-    "        # Categorize the surface\n",
-    "        df_copy[\"surf_cat\"] = pd.cut(\n",
-    "            df_copy.surf_log,\n",
-    "            bins=[0, 3, 4, 5, 12],\n",
-    "            labels=[\"1\", \"2\", \"3\", \"4\"],\n",
-    "        ).astype(str)\n",
-    "\n",
-    "    df_copy[surface_feature_name] = df_copy[\"surf_cat\"].replace(\"nan\", \"0\")\n",
-    "    df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(int)\n",
-    "    df_copy = df_copy.drop(columns=[\"surf_log\", \"surf_cat\"], errors=\"ignore\")\n",
-    "    return df_copy\n",
-    "\n",
-    "\n",
-    "def clean_and_tokenize_df(\n",
-    "    df,\n",
-    "    categorical_features=[\"EVT\", \"CJ\", \"NAT\", \"TYP\", \"CRT\"],\n",
-    "    text_feature=\"libelle_processed\",\n",
-    "    label_col=\"apet_finale\",\n",
-    "):\n",
-    "    df.fillna(\"nan\", inplace=True)\n",
-    "\n",
-    "    df = df.rename(\n",
-    "        columns={\n",
-    "            \"evenement_type\": \"EVT\",\n",
-    "            \"cj\": \"CJ\",\n",
-    "            \"activ_nat_et\": \"NAT\",\n",
-    "            \"liasse_type\": \"TYP\",\n",
-    "            \"activ_surf_et\": \"SRF\",\n",
-    "            \"activ_perm_et\": \"CRT\",\n",
-    "        }\n",
-    "    )\n",
-    "\n",
-    "    les = []\n",
-    "    for col in categorical_features:\n",
-    "        le = LabelEncoder()\n",
-    "        df[col] = le.fit_transform(df[col])\n",
-    "        
les.append(le)\n", - "\n", - " df = categorize_surface(df, \"SRF\", like_sirene_3=True)\n", - " df = df[[text_feature, \"EVT\", \"CJ\", \"NAT\", \"TYP\", \"SRF\", \"CRT\", label_col]]\n", - "\n", - " return df, les\n", - "\n", - "\n", - "def stratified_split_rare_labels(X, y, test_size=0.2, min_train_samples=1):\n", - " # Get unique labels and their frequencies\n", - " unique_labels, label_counts = np.unique(y, return_counts=True)\n", - "\n", - " # Separate rare and common labels\n", - " rare_labels = unique_labels[label_counts == 1]\n", - "\n", - " # Create initial mask for rare labels to go into training set\n", - " rare_label_mask = np.isin(y, rare_labels)\n", - "\n", - " # Separate data into rare and common label datasets\n", - " X_rare = X[rare_label_mask]\n", - " y_rare = y[rare_label_mask]\n", - " X_common = X[~rare_label_mask]\n", - " y_common = y[~rare_label_mask]\n", - "\n", - " # Split common labels stratified\n", - " X_common_train, X_common_test, y_common_train, y_common_test = train_test_split(\n", - " X_common, y_common, test_size=test_size, stratify=y_common\n", - " )\n", - "\n", - " # Combine rare labels with common labels split\n", - " X_train = np.concatenate([X_rare, X_common_train])\n", - " y_train = np.concatenate([y_rare, y_common_train])\n", - " X_test = X_common_test\n", - " y_test = y_common_test\n", - "\n", - " return X_train, X_test, y_train, y_test\n", - "\n", - "def add_libelles(\n", - " df: pd.DataFrame,\n", - " df_naf: pd.DataFrame,\n", - " y: str,\n", - " text_feature: str,\n", - " textual_features: list,\n", - " categorical_features: list,\n", - "):\n", - " missing_codes = set(df_naf[\"code\"])\n", - " fake_obs = df_naf[df_naf[\"code\"].isin(missing_codes)]\n", - " fake_obs[y] = fake_obs[\"code\"]\n", - " fake_obs[text_feature] = fake_obs[[text_feature]].apply(\n", - " lambda row: \" \".join(f\"[{col}] {val}\" for col, val in row.items() if val != \"\"), axis=1\n", - " )\n", - " df = pd.concat([df, fake_obs[[col for col in fake_obs.columns if col in df.columns]]])\n", - "\n", - " if textual_features is not None:\n", - " for feature in textual_features:\n", - " df[feature] = df[feature].fillna(value=\"\")\n", - " if categorical_features is not None:\n", - " for feature in categorical_features:\n", - " df[feature] = df[feature].fillna(value=\"NaN\")\n", - "\n", - " print(f\"\\t*** {len(missing_codes)} codes have been added in the database...\\n\")\n", - " return df" + "df" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "1fd02895", + "execution_count": null, + "id": "6", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\t*** 732 codes have been added in the database...\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "categorical_features = [\"evenement_type\", \"cj\", \"activ_nat_et\", \"liasse_type\", \"activ_surf_et\", \"activ_perm_et\"]\n", + "categorical_features = [ \"CJ\", \"NAT\", \"TYP\", \"SRF\", \"CRT\"]\n", "text_feature = \"libelle\"\n", - "y = \"apet_finale\"\n", - "textual_features = None\n", - "\n", - "naf2008 = pd.read_csv(\"https://minio.lab.sspcloud.fr/projet-ape/data/naf2008.csv\", sep=\";\")\n", - "df = add_libelles(df, naf2008, y, text_feature, textual_features, categorical_features)" - ] - }, - { - "cell_type": "markdown", - "id": "67f4160d-0c98-4700-80f4-1ba454e6a2df", - "metadata": {}, - "source": [ - "## Preprocessing\n", - "\n", - "To reduce noise in text fields, we recommend pre-processing before\n", - "training a model with our package. 
We assume this preprocessing is\n", - "handled by the package user : this gives him the opportunity to control\n", - "data cleansing.\n", - "\n", - "Here’s an example of the type of preprocessing that can be carried out\n", - "before moving on to the modeling phase" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "61b0252e", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/python/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from torchTextClassifiers.utilities.preprocess import clean_text_feature\n", - "df[\"libelle_processed\"] = clean_text_feature(df[\"libelle\"])" + "y = \"apet_finale\"" ] }, { "cell_type": "markdown", - "id": "acde2929-fe92-4107-8066-a5c8ac5d6428", + "id": "7", "metadata": {}, "source": [ "Right now, the model requires the label (variable y) to be a numerical\n", @@ -755,8 +129,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "8c02a833", + "execution_count": null, + "id": "8", "metadata": {}, "outputs": [], "source": [ @@ -766,1076 +140,662 @@ }, { "cell_type": "markdown", - "id": "25593e1a-1661-49e3-9734-272ec4745de1", + "id": "9", "metadata": {}, "source": [ "The function `clean_and_tokenize_df` requires special `DataFrame`\n", "formatting:\n", "\n", "- First column contains the processed text (str)\n", - "- Next ones contain the “encoded” categorical (discrete) variables in\n", - " int format" + "- Next ones contain the **“encoded”** categorical (discrete) variables in\n", + " **int format**, as required by torchTextClassifiers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10", + "metadata": {}, + "outputs": [], + "source": [ + "df, _ = clean_and_tokenize_df(df, text_feature=\"libelle\")\n", + "X = df[[\"libelle\", \"CJ\", \"NAT\", \"TYP\", \"CRT\", \"SRF\"]].values\n", + "y = df[\"apet_finale\"].values\n", + "\n", + "df" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "5fb5b0c7", + "execution_count": null, + "id": "11", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_159490/2075507147.py:60: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.\n", - " df.fillna(\"nan\", inplace=True)\n" - ] - } - ], + "outputs": [], "source": [ - "df, _ = clean_and_tokenize_df(df, text_feature=\"libelle_processed\")\n", - "X = df[[\"libelle_processed\", \"EVT\", \"CJ\", \"NAT\", \"TYP\", \"CRT\", \"SRF\"]].values\n", - "y = df[\"apet_finale\"].values" + "X.shape, y.shape" ] }, { "cell_type": "markdown", - "id": "e70de831-dbc9-49be-b0c4-d70dd6479d03", + "id": "12", "metadata": {}, "source": [ - "## Splitting in train-test sets\n", - "\n", - "As usual in a learning approach, you need to break down your data into\n", - "learning and test/validation samples to obtain robust performance\n", - "statistics.\n", - "This work is the responsibility of the package’s users. Please make sure that np.max(y_train) == len(np.unique(y_train))-1 (i.e. 
your labels are well encoded, in a consecutive manner, starting from 0), and that all the possible labels appear at least once in the training set.\n",
-    "\n",
-    "We provide the function stratified_train_test_split to match these requirements here.."
+    "We now split the data into train and test sets, as is standard in machine learning tasks.\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
-   "id": "b593fd75",
+   "execution_count": null,
+   "id": "13",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "True\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "from sklearn.model_selection import train_test_split \n",
-    "X_train, X_test, y_train, y_test = stratified_split_rare_labels(X, y)\n",
    "\n",
-    "print(np.max(y_train) == len(np.unique(y_train))-1)"
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y)"
   ]
  },
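+  {
+   "cell_type": "markdown",
+   "id": "13b",
+   "metadata": {},
+   "source": [
+    "Note that some NACE codes are rare, so a purely random split can leave a class entirely out of the training set. If every class has at least two samples, a stratified split is a simple safeguard. The sketch below is illustrative only (standard scikit-learn API; the `13b`/`13c` cell ids are placeholders):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative alternative, left commented out: stratify on y so every code\n",
+    "# keeps its train/test proportions (requires >= 2 samples per class).\n",
+    "# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)"
+   ]
+  },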
  {
   "cell_type": "markdown",
-   "id": "8729c5f4-9038-4437-929b-fc500dc0db7a",
+   "id": "14",
   "metadata": {},
   "source": [
-    "# Build the torchTextClassifiers FastText model (without training it)\n",
-    "\n",
-    "There are several ways to define and train a FastText model in\n",
-    "this package.\n",
-    "\n",
-    "We first show how to initialize the model and then afterwards build it.\n",
-    "\n",
-    "`create_fasttext` function accepts the following parameters:\n",
-    "\n",
-    "| Parameter | Meaning | Example Value |\n",
-    "|---------------------|------------------------------------------|----------|\n",
-    "| `num_tokens` | Number of rows in the embedding matrix (size of the vocabulary) | 100000 |\n",
-    "| `embedding_dim` | Dimension of the embedding (number of columns in the matrix) | 50 |\n",
-    "| `sparse` | Use sparse embedding for fast computation (PyTorch) | False |\n",
-    "| `categorical_embedding_dims` | Dimension of the embedding for categorical features | 10 |\n",
-    "| `min_count` | Minimum occurrences of a word in the corpus to be included | 1 |\n",
-    "| `min_n` | Minimum length of character n-grams | 3 |\n",
-    "| `max_n` | Maximum length of character n-grams | 6 |\n",
-    "| `len_word_ngrams` | Length of word n-grams | 3 |"
+    "# Let's dive into the different components of a text classification model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15",
   "metadata": {},
   "source": [
    "## Tokenizer"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
-   "id": "5879ca88",
+   "execution_count": null,
+   "id": "16",
   "metadata": {},
   "outputs": [],
   "source": [
-    "from torchTextClassifiers import create_fasttext\n",
-    "\n",
-    "parameters = {\n",
-    "    \"num_tokens\": 100000,\n",
-    "    \"embedding_dim\": 50,\n",
-    "    \"sparse\": False,\n",
-    "    \"categorical_embedding_dims\": 10,\n",
-    "    \"min_count\": 1,\n",
-    "    \"min_n\": 3,\n",
-    "    \"max_n\": 6,\n",
-    "    \"len_word_ngrams\": 3,\n",
-    "}\n",
-    "\n",
-    "parameters_train = {\n",
-    "    \"lr\": 0.004,\n",
-    "    \"num_epochs\": 1,\n",
-    "    \"batch_size\": 256,\n",
-    "    \"patience\": 3 \n",
-    "}\n",
-    "\n",
-    "model = create_fasttext(**parameters)"
+    "text = X_train[:, 0].tolist() # extract the text column as a list"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "05f9d26b-f08f-41be-93e4-b55a2c86690c",
+   "id": "17",
   "metadata": {},
   "source": [
-    "`model` is then a special `torchTextClassifiers` object:"
+    "You can directly load a pretrained tokenizer from Hugging Face. But you won't have control over its vocabulary size or other parameters!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "ebf5608b",
+   "id": "18",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "torchTextClassifiers.classifiers.fasttext.wrapper.FastTextWrapper"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
-    "type(model.classifier)"
+    "tokenizer = HuggingFaceTokenizer.load_from_pretrained(\"google-bert/bert-base-uncased\")\n",
+    "print(\"This tokenizer outputs tensors of size \", tokenizer.tokenize(text[0]).input_ids.shape)\n",
+    "print(\"The tokens are here \", tokenizer.tokenizer.convert_ids_to_tokens(tokenizer.tokenize(text[0]).input_ids.squeeze(0)))\n",
+    "print(\"The total number of tokens is \", tokenizer.vocab_size)"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "dcbe8289-f506-48f9-b854-96f25974368f",
+   "id": "19",
   "metadata": {},
   "source": [
-    "As any `PyTorch` model, it accepts being save as a JSON for later on\n",
-    "use:"
+    "Or you can train your own tokenizer from scratch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "6c3b2b85",
+   "id": "20",
   "metadata": {},
   "outputs": [],
   "source": [
-    "model.to_json('torchTextClassifiers_config.json')\n",
-    "# Loading from JSON now works with the new API:\n",
-    "# from torchTextClassifiers import torchTextClassifiers\n",
-    "# loaded_model = torchTextClassifiers.from_json('torchTextClassifiers_config.json')"
+    "tokenizer = WordPieceTokenizer(vocab_size=5000, output_dim=125)\n",
+    "tokenizer.train(text)\n",
+    "print(\"This tokenizer outputs tensors of size \", tokenizer.tokenize(text[0]).input_ids.shape)\n",
+    "print(\"The tokens are here \", tokenizer.tokenizer.convert_ids_to_tokens(tokenizer.tokenize(text[0]).input_ids.squeeze(0)))\n",
+    "print(\"The total number of tokens is \", tokenizer.vocab_size)"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "5f8b017f-66a1-413d-85e8-1981adf64823",
+   "id": "21",
   "metadata": {},
   "source": [
-    "We can apply `build` to finally train our model. These are the\n",
-    "parameters accepted by the `build` method\n",
-    "\n",
-    "| Parameter | Meaning | Example Value |\n",
-    "|---------------------|------------------------------------------|----------|\n",
-    "| `lr` | Learning rate | 0.004 |\n",
-    "| `num_epochs` | Number of training epochs | 1 |\n",
-    "| `batch_size` | Batch size for training | 256 |\n",
-    "| `patience` | Early stopping patience (number of epochs without improvement) | 3 |\n",
-    "\n",
-    "We build the model using the training data. We have now access to the\n",
-    "tokenizer, the PyTorch model as well as a PyTorch Lightning module ready\n",
-    "to be trained. Note that Lightning is high-level framework for PyTorch\n",
-    "that simplifies the process of training, validating, and deploying\n",
-    "machine learning models."
+    "## The PyTorch Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22",
   "metadata": {},
   "source": [
    "To train a text classification model using PyTorch, you need to create a Dataset object that will handle the data loading and preprocessing. The `TextClassificationDataset` class from the `torchTextClassifiers` library can be used for this purpose."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
-   "id": "e2e43d0e",
+   "execution_count": null,
+   "id": "23",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2025-07-23 12:36:22 - torchTextClassifiers.classifiers.fasttext.model - num_rows is different from the number of tokens in the tokenizer. 
Using provided num_rows.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2025-07-23 12:36:22 - root - No scheduler parameters provided. Using defaults.\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "model.build(X_train, y_train, lightning=True, lr=parameters_train.get(\"lr\"))"
+    "train_dataset = TextClassificationDataset(\n",
+    "    texts=X_train[:, 0].tolist(),\n",
+    "    categorical_variables=X_train[:, 1:].tolist(),\n",
+    "    tokenizer=tokenizer,\n",
+    "    labels=y_train,\n",
+    ")\n",
+    "train_dataset[0]"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "b5a7d5fa-596a-470b-892e-e8fafdb8221a",
+   "id": "24",
   "metadata": {},
   "source": [
-    "One can retrieve different objects from `model` instance:\n",
-    "\n",
-    "- `model.pytorch_model`\n",
-    "- `model.tokenizer`\n",
-    "- `model.lightning_module`"
+    "You can then create a ``DataLoader`` to iterate over the dataset during training. Via its `collate_fn` function, the DataLoader handles the raw text and outputs tokenized, padded PyTorch tensors that can be fed directly to the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "091024e6",
+   "id": "25",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "FastTextModel(\n",
-       "  (embeddings): EmbeddingBag(108297, 50, mode='mean', padding_idx=108296)\n",
-       "  (emb_0): Embedding(24, 10)\n",
-       "  (emb_1): Embedding(42, 10)\n",
-       "  (emb_2): Embedding(10, 10)\n",
-       "  (emb_3): Embedding(13, 10)\n",
-       "  (emb_4): Embedding(3, 10)\n",
-       "  (emb_5): Embedding(5, 10)\n",
-       "  (fc): Linear(in_features=60, out_features=732, bias=True)\n",
-       ")"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
-    "model.classifier.pytorch_model"
+    "train_dataloader = train_dataset.create_dataloader(\n",
+    "    batch_size=256,\n",
+    "    num_workers=12,\n",
+    "    shuffle=False,\n",
+    "    )\n",
+    "batch = next(iter(train_dataloader))\n",
+    "\n",
+    "print(\"Input IDs shape: \", batch[\"input_ids\"].shape) # (batch_size, tokenizer.output_dim (=seq_length))"
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d983b113",
+   "cell_type": "markdown",
+   "id": "26",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       ""
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
   "source": [
-    "model.classifier.tokenizer"
+    "## The PyTorch Model"
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9b23f1ba",
+   "cell_type": "markdown",
+   "id": "27",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "FastTextModule(\n",
-       "  (model): FastTextModel(\n",
-       "    (embeddings): EmbeddingBag(108297, 50, mode='mean', padding_idx=108296)\n",
-       "    (emb_0): Embedding(24, 10)\n",
-       "    (emb_1): Embedding(42, 10)\n",
-       "    (emb_2): Embedding(10, 10)\n",
-       "    (emb_3): Embedding(13, 10)\n",
-       "    (emb_4): Embedding(3, 10)\n",
-       "    (emb_5): Embedding(5, 10)\n",
-       "    (fc): Linear(in_features=60, out_features=732, bias=True)\n",
-       "  )\n",
-       "  (loss): CrossEntropyLoss()\n",
-       "  (accuracy_fn): MulticlassAccuracy()\n",
-       ")"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
   "source": [
-    "model.classifier.lightning_module"
+    "We provide three main components for the model architecture:\n",
+    "\n",
+    "- The `TextEmbedder` class, which embeds the input token ids into dense vectors\n",
+    "- The `CategoricalVariableNet` class, which embeds the categorical variables\n",
+    "- The `ClassificationHead`, which outputs the prediction vector\n",
+    "\n",
+    "Finally, the `TextClassificationModel` class combines all these components into a single model that can be trained end-to-end.\n",
+    "\n",
+    "All four of these objects inherit from the `torch.nn.Module` class, so you can use them as you would any PyTorch model."
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "b804391a-979a-4a74-a5f7-d8e27550e20e",
+   "id": "28",
   "metadata": {},
   "source": [
-    "One can also retrieve more precise information regarding the tokenizer.\n",
-    "This can be useful to know how text is parsed before being given to the\n",
-    "neural network:"
+    "### The TextEmbedder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "00c077b0",
+   "id": "29",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{0: '',\n",
-      " 369: 'sit',\n",
-      " 8402: 'lorem ipsum dolor',\n",
-      " 8477: '',\n",
-      " 9254: '',\n",
-      " 17674: 'ame',\n",
-      " 19233: '',\n",
-      " 21660: '',\n",
-      " 33686: 'or>',\n",
-      " 36146: 'ipsum dolor',\n",
-      " 37685: '',\n",
-      " 44699: '',\n",
-      " 46043: 'sit>',\n",
-      " 46176: 'ipsu',\n",
-      " 49083: 'psu',\n",
-      " 49236: 'orem>',\n",
-      " 50091: '',\n",
-      " 57650: 'it>',\n",
-      " 58295: 'olor>',\n",
-      " 60820: 'lor>',\n",
-      " 60827: 'ore',\n",
-      " 63114: 'sum>',\n",
-      " 65777: 'met',\n",
-      " 65864: '',\n",
-      " 68083: 'olor',\n",
-      " 68290: 'orem',\n",
-      " 68834: 'psum',\n",
-      " 72463: 'rem',\n",
-      " 74123: 'ipsum>',\n",
-      " 74942: 'dolor sit',\n",
-      " 76898: 'lor',\n",
-      " 77899: '',\n",
-      " 87932: '',\n",
-      " 93076: 'psum>',\n",
-      " 93114: ' same output as ClassificationHead (logits)\n",
+    "model(input_ids=batch[\"input_ids\"], attention_mask=batch[\"attention_mask\"], categorical_vars=batch[\"categorical_vars\"]).shape"
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "0b4964f3",
+   "cell_type": "markdown",
+   "id": "44",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "([['', '', 'H '],\n",
-       " ['', '', 'e '],\n",
-       " ['', '', 'l '],\n",
-       " ['', '', 'l '],\n",
-       " ['', '', 'o '],\n",
-       " [''],\n",
-       " ['', '', 'w '],\n",
-       " ['', '', 'o '],\n",
-       " ['', '', 'r '],\n",
-       " ['', '', 'l '],\n",
-       " ['', '', 'd ']],\n",
-       " [tensor([40876, 0, 51965]),\n",
-       " tensor([51907, 0, 77296]),\n",
-       " tensor([74312, 0, 26137]),\n",
-       " tensor([74312, 0, 26137]),\n",
-       " tensor([ 9853, 0, 53786]),\n",
-       " tensor([0]),\n",
-       " tensor([29925, 0, 74978]),\n",
-       " tensor([ 9853, 0, 53786]),\n",
-       " tensor([ 8646, 0, 13223]),\n",
-       " tensor([74312, 0, 26137]),\n",
-       " tensor([ 89472, 0, 104945])],\n",
-       " [{40876: '', 0: '', 51965: 'H '},\n",
-       " {51907: '', 0: '', 77296: 'e '},\n",
-       " {74312: '', 0: '', 26137: 'l '},\n",
-       " {74312: '', 0: '', 26137: 'l '},\n",
-       " {9853: '', 0: '', 53786: 'o '},\n",
-       " {0: ''},\n",
-       " {29925: '', 0: '', 74978: 'w '},\n",
-       " {9853: '', 0: '', 53786: 'o '},\n",
-       " {8646: '', 0: '', 13223: 'r '},\n",
-       " {74312: '', 0: '', 26137: 'l '},\n",
-       " {89472: '', 0: '', 104945: 'd '}],\n",
-       " [{'': 40876, '': 0, 'H ': 51965},\n",
-       " {'': 51907, '': 0, 'e ': 77296},\n",
-       " {'': 74312, '': 0, 'l ': 26137},\n",
-       " {'': 74312, '': 0, 'l ': 26137},\n",
-       " {'': 9853, '': 0, 'o ': 53786},\n",
-       " {'': 0},\n",
-       " {'': 29925, '': 0, 'w ': 74978},\n",
-       " {'': 9853, '': 0, 'o ': 53786},\n",
-       " {'': 8646, '': 0, 'r ': 13223},\n",
-       " {'': 74312, '': 0, 'l ': 26137},\n",
-       " {'': 89472, '': 0, 'd ': 104945}])"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
   "source": [
- 
"tokenizer.tokenize(\"Hello world\")" + "### The TextClassificationModule" ] }, { "cell_type": "markdown", - "id": "fd5b6899-7831-40a6-9841-bbc1b0804956", + "id": "45", "metadata": {}, "source": [ - "However, there is a more straightforward way to do: creating directly\n", - "the `NGramTokenizer` instance:" + "We provide a PyTorch Lightning wrapper, for easy training and checkpointing. We refer to [Lightning's doc](https://lightning.ai/docs/pytorch/stable/) for more details on how to use it." ] }, { "cell_type": "code", "execution_count": null, - "id": "8a6ee96b", + "id": "46", "metadata": {}, "outputs": [], "source": [ - "tokenizer = NGramTokenizer(\n", - " **parameters,\n", - " training_text=training_text\n", - " )" + "import torch\n", + "\n", + "module = TextClassificationModule(\n", + " model=model,\n", + " loss=torch.nn.CrossEntropyLoss(),\n", + " optimizer=torch.optim.Adam,\n", + " optimizer_params={\"lr\": 1e-3},\n", + " scheduler=None,\n", + " scheduler_params=None,\n", + " scheduler_interval=\"epoch\",\n", + ")\n", + "module" ] }, { - "cell_type": "code", - "execution_count": 24, - "id": "776636e6", + "cell_type": "markdown", + "id": "47", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "([['', '', 'H '],\n", - " ['', '', 'e '],\n", - " ['', '', 'l '],\n", - " ['', '', 'l '],\n", - " ['', '', 'o '],\n", - " [''],\n", - " ['', '', 'w '],\n", - " ['', '', 'o '],\n", - " ['', '', 'r '],\n", - " ['', '', 'l '],\n", - " ['', '', 'd ']],\n", - " [tensor([40876, 0, 51965]),\n", - " tensor([51907, 0, 77296]),\n", - " tensor([74312, 0, 26137]),\n", - " tensor([74312, 0, 26137]),\n", - " tensor([ 9853, 0, 53786]),\n", - " tensor([0]),\n", - " tensor([29925, 0, 74978]),\n", - " tensor([ 9853, 0, 53786]),\n", - " tensor([ 8646, 0, 13223]),\n", - " tensor([74312, 0, 26137]),\n", - " tensor([ 89472, 0, 104945])],\n", - " [{40876: '', 0: '', 51965: 'H '},\n", - " {51907: '', 0: '', 77296: 'e '},\n", - " {74312: '', 0: '', 26137: 'l '},\n", - " {74312: '', 0: '', 26137: 'l '},\n", - " {9853: '', 0: '', 53786: 'o '},\n", - " {0: ''},\n", - " {29925: '', 0: '', 74978: 'w '},\n", - " {9853: '', 0: '', 53786: 'o '},\n", - " {8646: '', 0: '', 13223: 'r '},\n", - " {74312: '', 0: '', 26137: 'l '},\n", - " {89472: '', 0: '', 104945: 'd '}],\n", - " [{'': 40876, '': 0, 'H ': 51965},\n", - " {'': 51907, '': 0, 'e ': 77296},\n", - " {'': 74312, '': 0, 'l ': 26137},\n", - " {'': 74312, '': 0, 'l ': 26137},\n", - " {'': 9853, '': 0, 'o ': 53786},\n", - " {'': 0},\n", - " {'': 29925, '': 0, 'w ': 74978},\n", - " {'': 9853, '': 0, 'o ': 53786},\n", - " {'': 8646, '': 0, 'r ': 13223},\n", - " {'': 74312, '': 0, 'l ': 26137},\n", - " {'': 89472, '': 0, 'd ': 104945}])" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ - "tokenizer.tokenize(\"Hello world\")" + "# `torchTextClassifiers`: a wrapper to handle them all" ] }, { "cell_type": "markdown", - "id": "6b0fd6c0-9740-4a32-9bb2-4a3cfe174ea8", + "id": "48", "metadata": {}, "source": [ - "Why creating a `NGramTokenizer` separately ? 
Because model constructor\n",
-    "is now independent from training data:"
+    "## Initialization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "ee5dbe0b",
+   "id": "49",
   "metadata": {},
   "outputs": [],
   "source": [
-    "from torchTextClassifiers import build_fasttext_from_tokenizer\n",
+    "### Two main config objects that mirror the parameters used above - and you're good to go!\n",
    "\n",
-    "model = build_fasttext_from_tokenizer(\n",
-    "    tokenizer, \n",
-    "    embedding_dim=parameters[\"embedding_dim\"], \n",
-    "    categorical_embedding_dims=parameters[\"categorical_embedding_dims\"], \n",
-    "    sparse=parameters[\"sparse\"], \n",
-    "    lr=parameters_train[\"lr\"], \n",
-    "    num_classes=NUM_CLASSES, \n",
-    "    num_categorical_features=NUM_CAT_VAR, \n",
-    "    categorical_vocabulary_sizes=CAT_VOCAB_SIZE\n",
-    ")"
+    "model_config = ModelConfig(\n",
+    "    embedding_dim=embedding_dim,\n",
+    "    categorical_vocabulary_sizes=categorical_vocab_sizes,\n",
+    "    categorical_embedding_dims=categorical_embedding_dims,\n",
+    "    num_classes=num_classes,\n",
+    "    attention_config=attention_config,\n",
+    ")\n",
+    "\n",
+    "training_config = TrainingConfig(\n",
+    "    lr=1e-3,\n",
+    "    batch_size=256,\n",
+    "    num_epochs=10,\n",
+    ")\n",
+    "\n",
+    "ttc = torchTextClassifiers(\n",
+    "    tokenizer=tokenizer,\n",
+    "    model_config=model_config,\n",
+    ")\n",
+    "\n",
+    "## Given those parameters, the TextClassificationModel is created internally, with the right connections between components\n",
+    "ttc"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "f53080e9-9d78-479f-a446-2feb4a92b1de",
+   "id": "50",
   "metadata": {},
   "source": [
-    "**Warning**:\n",
-    "\n",
-    "If the PyTorch model building did not use the training data, please keep\n",
-    "in mind that its architecture (that you customize here) should match the\n",
-    "vocabulary size of the categorical variables and the total number of\n",
-    "class, otherwise the model will raise an error during training.\n",
-    "\n",
-    "# Train a torchTextClassifiers FastText model directly\n",
+    "## Training"
   ]
  },
  {
   "cell_type": "markdown",
+   "id": "51",
+   "metadata": {},
+   "source": [
+    "`torchTextClassifiers` has a `.train()` method that handles the whole training process for you:\n",
    "\n",
-    "If no advanced customization or PyTorch tuning is necessary, there is a\n",
-    "direct way of training model."
+ "- Init of dataset and dataloaders\n", + "- Init of Lightning module\n", + "- Training with early stopping and model checkpointing\n", + "- Using Lightning's Trainer under the hood" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "ce5dc4a1", + "execution_count": null, + "id": "52", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2025-07-23 12:38:57 - torchTextClassifiers.torchTextClassifiers - Starting training process...\n", - "2025-07-23 12:38:57 - torchTextClassifiers.torchTextClassifiers - Running on: cpu\n", - "2025-07-23 12:38:57 - torchTextClassifiers.classifiers.fasttext.model - Creating DataLoader with 12 workers.\n", - "2025-07-23 12:38:57 - torchTextClassifiers.classifiers.fasttext.model - Creating DataLoader with 12 workers.\n", - "GPU available: False, used: False\n", - "TPU available: False, using: 0 TPU cores\n", - "HPU available: False, using: 0 HPUs\n", - "/opt/python/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n", - "2025-07-23 12:38:57 - torchTextClassifiers.torchTextClassifiers - Launching training...\n", - "\n", - " | Name | Type | Params | Mode \n", - "-----------------------------------------------------------\n", - "0 | model | FastTextModel | 5.5 M | train\n", - "1 | loss | CrossEntropyLoss | 0 | train\n", - "2 | accuracy_fn | MulticlassAccuracy | 0 | train\n", - "-----------------------------------------------------------\n", - "5.5 M Trainable params\n", - "0 Non-trainable params\n", - "5.5 M Total params\n", - "21.842 Total estimated model params size (MB)\n", - "11 Modules in train mode\n", - "0 Modules in eval mode\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sanity Checking: | | 0/? 
[00:00 \u001b[39m\u001b[32m5\u001b[39m pred, conf = model.predict(X, top_k=TOP_K)\n\u001b[32m 6\u001b[39m pred_naf = encoder.inverse_transform(pred.reshape(-\u001b[32m1\u001b[39m))\n\u001b[32m 7\u001b[39m subset = naf2008.set_index(\u001b[33m\"\u001b[39m\u001b[33mcode\u001b[39m\u001b[33m\"\u001b[39m).loc[np.flip(pred_naf)]\n", - "\u001b[31mValueError\u001b[39m: not enough values to unpack (expected 2, got 1)" - ] - } - ], + "outputs": [], "source": [ - "text = [\"coiffeur, boulangerie, pâtisserie\"] # one text description\n", - "X= np.array([[text[0], 0, 0, 0, 0, 0, 0]]) # our new entry\n", - "TOP_K = 5\n", - "\n", - "pred, conf = model.predict(X, top_k=TOP_K)\n", - "pred_naf = encoder.inverse_transform(pred.reshape(-1))\n", - "subset = naf2008.set_index(\"code\").loc[np.flip(pred_naf)]\n", - "\n", - "for i in range(TOP_K-1, -1, -1):\n", - " print(f\"Prediction: {pred_naf[i]}, confidence: {conf[0, i]}, description: {subset['libelle'][pred_naf[i]]}\")" + "word_attributions = map_attributions_to_word(attributions, word_ids)\n", + "char_attributions = map_attributions_to_char(attributions, offsets, text)\n" ] }, { - "cell_type": "markdown", - "id": "f84e6bff-8fa7-4896-b60a-005ae5f1d3eb", + "cell_type": "code", + "execution_count": null, + "id": "57", "metadata": {}, + "outputs": [], "source": [ - "# Explainability\n", - "\n", - "The `torchTextClassifiers` framework provides explainability features through the `predict_and_explain` method. This allows you to understand which parts of the input text contribute most to the model's predictions." + "encoder.inverse_transform(np.array([predictions]).reshape(-1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58", + "metadata": {}, + "outputs": [], + "source": [ + "all_plots = plot_attributions_at_char(\n", + " text=text,\n", + " attributions_per_char=char_attributions,\n", + " titles = list(map(lambda x: f\"Attributions for code {x}\", encoder.inverse_transform(np.array([predictions]).reshape(-1)).tolist())),\n", + ")\n", + "figshow(all_plots[0])" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "58c46021", + "execution_count": null, + "id": "59", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Predictions with explanations:\n", - "Predicted classes: tensor([[727]])\n", - "Confidence scores: tensor([[0.5400]])\n", - "Word-level explanation scores shape: \n", - "Letter-level explanation scores shape: torch.Size([1, 1, 31])\n" - ] - } - ], + "outputs": [], "source": [ - "# Note: Visualization functions are not yet implemented in torchTextClassifiers\n", - "# The predict_and_explain method is available but visualization utilities need to be implemented\n", - "\n", - "pred, conf, all_scores, all_scores_letters = model.predict_and_explain(X)\n", - "\n", - "# TODO: Implement visualization functions\n", - "# visualize_word_scores(all_scores, text, pred_naf.reshape(1, -1))\n", - "# visualize_letter_scores(all_scores_letters, text, pred_naf.reshape(1, -1))\n", - "\n", - "print(\"Predictions with explanations:\")\n", - "print(f\"Predicted classes: {pred}\")\n", - "print(f\"Confidence scores: {conf}\")\n", - "print(f\"Word-level explanation scores shape: {all_scores.shape if hasattr(all_scores, 'shape') else type(all_scores)}\")\n", - "print(f\"Letter-level explanation scores shape: {all_scores_letters.shape if hasattr(all_scores_letters, 'shape') else type(all_scores_letters)}\")" + "all_plots = plot_attributions_at_word(\n", + " text=text,\n", + " 
attributions_per_word=word_attributions,\n", + " titles = list(map(lambda x: f\"Attributions for code {x}\", encoder.inverse_transform(np.array([predictions]).reshape(-1)).tolist())),\n", + ")\n", + "figshow(all_plots[0])" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -1849,7 +809,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.5" + "version": "3.13.8" } }, "nbformat": 4, diff --git a/notebooks/example.qmd b/notebooks/example.qmd deleted file mode 100644 index 666c33b..0000000 --- a/notebooks/example.qmd +++ /dev/null @@ -1,499 +0,0 @@ ---- -title: "Exemple d'utilisation de la librairie `TorchFastText`" ---- - - - -_Warning_ - -_`TorchFastText` library is still under active development. Have a regular look to [https://github.com/inseefrlab/torch-fastText](https://github.com/inseefrlab/torch-fastText) for latest information._ - -To install package, you can run the following snippet - -```{python} -#| output: false -#| eval: false - -# Stable version -pip install torchFastText -# Development version -# pip install !https://github.com/InseeFrLab/torch-fastText.git -``` - -# Load and preprocess data - -In that guide, we propose to illustrate main package functionalities using that `DataFrame`: - -```{python} -import pandas as pd -df = pd.read_parquet("https://minio.lab.sspcloud.fr/projet-ape/extractions/20241027_sirene4.parquet") -df = df.sample(10000) -``` - -Our goal will be to build multilabel classification for the `code` variable using `libelle` as feature. - -## Enriching our test dataset - -Unlike `Fasttext`, this package offers the possibility of having several feature columns of different types (string for the text column and additional variables in numeric form, for example). To illustrate that, we propose the following enrichment of the example dataset: - - -```{python} -import pandas as pd -import numpy as np -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder - -def categorize_surface( - df: pd.DataFrame, surface_feature_name: int, like_sirene_3: bool = True -) -> pd.DataFrame: - """ - Categorize the surface of the activity. - - Args: - df (pd.DataFrame): DataFrame to categorize. - surface_feature_name (str): Name of the surface feature. - like_sirene_3 (bool): If True, categorize like Sirene 3. - - Returns: - pd.DataFrame: DataFrame with a new column "surf_cat". 
- """ - df_copy = df.copy() - df_copy[surface_feature_name] = df_copy[surface_feature_name].replace("nan", np.nan) - df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(float) - # Check surface feature exists - if surface_feature_name not in df.columns: - raise ValueError(f"Surface feature {surface_feature_name} not found in DataFrame.") - # Check surface feature is a float variable - if not (pd.api.types.is_float_dtype(df_copy[surface_feature_name])): - raise ValueError(f"Surface feature {surface_feature_name} must be a float variable.") - - if like_sirene_3: - # Categorize the surface - df_copy["surf_cat"] = pd.cut( - df_copy[surface_feature_name], - bins=[0, 120, 400, 2500, np.inf], - labels=["1", "2", "3", "4"], - ).astype(str) - else: - # Log transform the surface - df_copy["surf_log"] = np.log(df[surface_feature_name]) - - # Categorize the surface - df_copy["surf_cat"] = pd.cut( - df_copy.surf_log, - bins=[0, 3, 4, 5, 12], - labels=["1", "2", "3", "4"], - ).astype(str) - - df_copy[surface_feature_name] = df_copy["surf_cat"].replace("nan", "0") - df_copy[surface_feature_name] = df_copy[surface_feature_name].astype(int) - df_copy = df_copy.drop(columns=["surf_log", "surf_cat"], errors="ignore") - return df_copy - - -def clean_and_tokenize_df( - df, - categorical_features=["EVT", "CJ", "NAT", "TYP", "CRT"], - text_feature="libelle_processed", - label_col="apet_finale", -): - df.fillna("nan", inplace=True) - - df = df.rename( - columns={ - "evenement_type": "EVT", - "cj": "CJ", - "activ_nat_et": "NAT", - "liasse_type": "TYP", - "activ_surf_et": "SRF", - "activ_perm_et": "CRT", - } - ) - - les = [] - for col in categorical_features: - le = LabelEncoder() - df[col] = le.fit_transform(df[col]) - les.append(le) - - df = categorize_surface(df, "SRF", like_sirene_3=True) - df = df[[text_feature, "EVT", "CJ", "NAT", "TYP", "SRF", "CRT", label_col]] - - return df, les - - -def stratified_split_rare_labels(X, y, test_size=0.2, min_train_samples=1): - # Get unique labels and their frequencies - unique_labels, label_counts = np.unique(y, return_counts=True) - - # Separate rare and common labels - rare_labels = unique_labels[label_counts == 1] - - # Create initial mask for rare labels to go into training set - rare_label_mask = np.isin(y, rare_labels) - - # Separate data into rare and common label datasets - X_rare = X[rare_label_mask] - y_rare = y[rare_label_mask] - X_common = X[~rare_label_mask] - y_common = y[~rare_label_mask] - - # Split common labels stratified - X_common_train, X_common_test, y_common_train, y_common_test = train_test_split( - X_common, y_common, test_size=test_size, stratify=y_common - ) - - # Combine rare labels with common labels split - X_train = np.concatenate([X_rare, X_common_train]) - y_train = np.concatenate([y_rare, y_common_train]) - X_test = X_common_test - y_test = y_common_test - - return X_train, X_test, y_train, y_test - -def add_libelles( - df: pd.DataFrame, - df_naf: pd.DataFrame, - y: str, - text_feature: str, - textual_features: list, - categorical_features: list, -): - missing_codes = set(df_naf["code"]) - fake_obs = df_naf[df_naf["code"].isin(missing_codes)] - fake_obs[y] = fake_obs["code"] - fake_obs[text_feature] = fake_obs[[text_feature]].apply( - lambda row: " ".join(f"[{col}] {val}" for col, val in row.items() if val != ""), axis=1 - ) - df = pd.concat([df, fake_obs[[col for col in fake_obs.columns if col in df.columns]]]) - - if textual_features is not None: - for feature in textual_features: - df[feature] = 
df[feature].fillna(value="") - if categorical_features is not None: - for feature in categorical_features: - df[feature] = df[feature].fillna(value="NaN") - - print(f"\t*** {len(missing_codes)} codes have been added in the database...\n") - return df -``` - -```{python} -categorical_features = ["evenement_type", "cj", "activ_nat_et", "liasse_type", "activ_surf_et", "activ_perm_et"] -text_feature = "libelle" -y = "apet_finale" -textual_features = None - -naf2008 = pd.read_csv("https://minio.lab.sspcloud.fr/projet-ape/data/naf2008.csv", sep=";") -df = add_libelles(df, naf2008, y, text_feature, textual_features, categorical_features) -``` - - -## Preprocessing - -To reduce noise in text fields, we recommend pre-processing before training a model with our package. We assume this preprocessing is handled by the package user : this gives him the opportunity to control data cleansing. - -Here's an example of the type of preprocessing that can be carried out before moving on to the modeling phase - -```{python} -from torchFastText.preprocess import clean_text_feature -df["libelle_processed"] = clean_text_feature(df["libelle"]) -``` - -Right now, the model requires the label (variable y) to be a numerical variable. If the label variable is a text variable, we recommend using Scikit Learn's [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) to convert into a numeric variable. Using that function will give user the possibility to get back labels from the encoder after running predictions. - -```{python} -encoder = LabelEncoder() -df["apet_finale"] = encoder.fit_transform(df["apet_finale"]) -``` - -The function `clean_and_tokenize_df` requires special `DataFrame` formatting: - -- First column contains the processed text (str) -- Next ones contain the "encoded" categorical (discrete) variables in int format - - -```{python} -df, _ = clean_and_tokenize_df(df, text_feature="libelle_processed") -X = df[["libelle_processed", "EVT", "CJ", "NAT", "TYP", "CRT", "SRF"]].values -y = df["apet_finale"].values -``` - -## Splitting in train-test sets - -As usual in a learning approach, you need to break down your data into learning and test/validation samples to obtain robust performance statistics. - -This work is the responsibility of the package's users. Here's an example of how to do it, using the [`train_test_split`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) function in `Scikit`. - -```{python} -from sklearn.model_selection import train_test_split -X_train, X_test, y_train, y_test = train_test_split(X, y) -``` - -# Build the torch-fastText model (without training it) - -There are several ways to define and train a pytorch.fasttext model in this package. - -We first show how to initialize the model and then afterwars build it. 
- -`torchFastText` function accepts the following parameters: - -| Parameter | Meaning | Example Value | -|---------------------------------|---------------------------------------------------------------------|--------------| -| `num_tokens` | Number of rows in the embedding matrix (size of the vocabulary) | 100000 | -| `embedding_dim` | Dimension of the embedding (number of columns in the matrix) | 50 | -| `sparse` | Use sparse embedding for fast computation (PyTorch) | False | -| `categorical_embedding_dims` | Dimension of the embedding for categorical features | 10 | -| `min_count` | Minimum occurrences of a word in the corpus to be included | 1 | -| `min_n` | Minimum length of character n-grams | 3 | -| `max_n` | Maximum length of character n-grams | 6 | -| `len_word_ngrams` | Length of word n-grams | 3 | - - -```{python} -from torchFastText import torchFastText - -parameters = { - "num_tokens": 100000, - "embedding_dim": 50, - "sparse": False, - "categorical_embedding_dims": 10, - "min_count": 1, - "min_n": 3, - "max_n": 6, - "len_word_ngrams": 3, -} - -parameters_train = { - "lr": 0.004, - "num_epochs": 1, - "batch_size": 256, - "patience": 3 -} - -model = torchFastText(**parameters) -``` - -`model` is then a special `torchFastText` object: - -```{python} -type(model) -``` - -As any `PyTorch` model, it accepts being save as a JSON for later on use: - -```{python} -model.to_json('torchFastText_config.json') -# model = torchFastText.from_json('torchFastText_config.json') -``` - -We can apply `build` to finally train our model. These are the parameters accepted by the `build` method - -| Parameter | Meaning | Example Value | -|---------------------------------|---------------------------------------------------------------------|--------------| -| `lr` | Learning rate | 0.004 | -| `num_epochs` | Number of training epochs | 1 | -| `batch_size` | Batch size for training | 256 | -| `patience` | Early stopping patience (number of epochs without improvement) | 3 | - - -We build the model using the training data. -We have now access to the tokenizer, the PyTorch model as well as a PyTorch Lightning module ready to be trained. -Note that Lightning is high-level framework for PyTorch that simplifies the process of training, validating, and deploying machine learning models. - - -```{python} -model.build(X_train, y_train, lightning=True, lr=parameters_train.get("lr")) -``` - -One can retrieve different objects from `model` instance: - -* `model.pytorch_model` -* `model.tokenizer` -* `model.lightning_module` - - -```{python} -model.pytorch_model -``` - -```{python} -model.tokenizer -``` - -```{python} -model.lightning_module -``` - -One can also retrieve more precise information regarding the tokenizer. This can be useful to know how text is parsed before being given to the neural network: - - -```{python} -from pprint import pprint -sentence = ["lorem ipsum dolor sit amet"] -pprint(model.tokenizer.tokenize(sentence)[2][0]) -``` - - -Saving parameters to JSON can also be done after building, but the model needs to be rebuilt after loading. - -```{python} -model.to_json('torchFastText_config.json') -``` - - -## Alternative way to build torchFastText - -The training data is only useful to initialize the tokenizer, but X_train and y_train are not needed to initialize the PyTorch model, provided we give the right parameters to construct layer. 
- -To highlight this, we provide a lower-level process to build the model where one can first build the tokenizer, and then build the model with custom architecture parameters. - -The tokenizer can be loaded **from the same JSON file** as the model parameters, or initialized using the right arguments. - - -```{python} -del model -``` - -Let's decompose our features in two group: - -* We have our textual feature stored in the first column of the features matrix -* All other columns are categorical variables - -```{python} -training_text = X_train[:, 0].tolist() -categorical_variables = X_train[:, 1:] -``` - -We need to create a few variables that will be useful afterwards - -```{python} -CAT_VOCAB_SIZE = (np.max(categorical_variables, axis=0) + 1).astype(int).tolist() -NUM_CLASSES = len(np.unique(y_train)) -NUM_CAT_VAR = categorical_variables.shape[1] -``` - -Now let's come to the nitty gritty. There are several ways to create an instance of the tokenizer. - -First, we can create the tokenizer from : - -* model definition in the JSON file created beforehand -* textual data in training dataset - -```{python} -from torchFastText.datasets import NGramTokenizer -tokenizer = NGramTokenizer.from_json('torchFastText_config.json', training_text) -``` - -```{python} -tokenizer.tokenize("Hello world") -``` - -However, there is a more straightforward way to do: creating directly the `NGramTokenizer` instance: - - -```{python} -tokenizer = NGramTokenizer( - **parameters, - training_text=training_text - ) -``` - -```{python} -tokenizer.tokenize("Hello world") -``` - -Why creating a `NGramTokenizer` separately ? Because model constructor is now independent from training data: - -```{python} -#| echo: false -#| eval: false -# TODO : allow to do that -#torchFastText.build_from_tokenizer( - #tokenizer, - #**parameters, - #**parameters_build -# ) -``` - -```{python} -model = torchFastText.build_from_tokenizer( - tokenizer, - embedding_dim=parameters["embedding_dim"], - categorical_embedding_dims=parameters["categorical_embedding_dims"], - sparse=parameters["sparse"], - lr=parameters_train["lr"], - num_classes=NUM_CLASSES, - num_categorical_features=NUM_CAT_VAR, - categorical_vocabulary_sizes=CAT_VOCAB_SIZE -) -``` - -__Warning__: - -If the PyTorch model building did not use the training data, please keep in mind that its architecture (that you customize here) should match the vocabulary size of the categorical variables and the total number of class, otherwise the model will raise an error during training. - - -# Train a torchFastText model directly - -If no advanced customization or PyTorch tuning is necessary, there is a direct way of training model. 
- - -```{python} -#| eval: false -model.train( - X_train, - y_train, - X_test, - y_test, - num_epochs=parameters_train['num_epochs'], - batch_size=parameters_train['batch_size'], - patience_scheduler=parameters_train['patience'], - patience_train=parameters_train['patience'], - lr=parameters_train['lr'], - verbose = True -) -``` - -# Load a trained model from a Lightning checkpoint - -/!\ TOCOMPLETE - - -```{python} -#| eval: false -model.load_from_checkpoint(model.best_model_path) # or any other checkpoint path (string) -``` - -# Predicting from new labels - - -```{python} -#| eval: false -text = ["coiffeur, boulangerie, pâtisserie"] # one text description -X= np.array([[text[0], 0, 0, 0, 0, 0, 0]]) # our new entry -TOP_K = 5 - -pred, conf = model.predict(X, top_k=TOP_K) -pred_naf = encoder.inverse_transform(pred.reshape(-1)) -subset = naf2008.set_index("code").loc[np.flip(pred_naf)] - -for i in range(TOP_K-1, -1, -1): - print(f"Prediction: {pred_naf[i]}, confidence: {conf[0, i]}, description: {subset['libelle'][pred_naf[i]]}") - -``` - -# Explainability - - -```{python} -#| eval: false -from torchFastText.explainability.visualisation import ( - visualize_letter_scores, - visualize_word_scores, -) - -pred, conf, all_scores, all_scores_letters = model.predict_and_explain(X) -visualize_word_scores(all_scores, text, pred_naf.reshape(1, -1)) -visualize_letter_scores(all_scores_letters, text, pred_naf.reshape(1, -1)) -``` \ No newline at end of file diff --git a/notebooks/torchFastText_config.json b/notebooks/torchFastText_config.json deleted file mode 100644 index ff40f49..0000000 --- a/notebooks/torchFastText_config.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "embedding_dim": 50, - "sparse": false, - "num_tokens": 100000, - "min_count": 1, - "min_n": 3, - "max_n": 6, - "len_word_ngrams": 3, - "num_classes": 646, - "num_rows": 107992, - "categorical_vocabulary_sizes": [ - 24, - 40, - 8, - 13, - 3, - 4 - ], - "categorical_embedding_dims": 10, - "num_categorical_features": 6, - "direct_bagging": true -} \ No newline at end of file diff --git a/notebooks/utils.py b/notebooks/utils.py index 19c4734..f458581 100644 --- a/notebooks/utils.py +++ b/notebooks/utils.py @@ -1,8 +1,8 @@ -import pandas as pd import numpy as np -from sklearn.model_selection import train_test_split +import pandas as pd from sklearn.preprocessing import LabelEncoder + def categorize_surface( df: pd.DataFrame, surface_feature_name: int, like_sirene_3: bool = True ) -> pd.DataFrame: @@ -53,23 +53,11 @@ def categorize_surface( def clean_and_tokenize_df( df, - categorical_features=["EVT", "CJ", "NAT", "TYP", "CRT"], + categorical_features=["CJ", "NAT", "TYP", "CRT"], text_feature="libelle_processed", label_col="apet_finale", ): df.fillna("nan", inplace=True) - - df = df.rename( - columns={ - "evenement_type": "EVT", - "cj": "CJ", - "activ_nat_et": "NAT", - "liasse_type": "TYP", - "activ_surf_et": "SRF", - "activ_perm_et": "CRT", - } - ) - les = [] for col in categorical_features: le = LabelEncoder() @@ -77,62 +65,6 @@ def clean_and_tokenize_df( les.append(le) df = categorize_surface(df, "SRF", like_sirene_3=True) - df = df[[text_feature, "EVT", "CJ", "NAT", "TYP", "SRF", "CRT", label_col]] + df = df[[text_feature, "CJ", "NAT", "TYP", "SRF", "CRT", label_col]] return df, les - - -def stratified_split_rare_labels(X, y, test_size=0.2, min_train_samples=1): - # Get unique labels and their frequencies - unique_labels, label_counts = np.unique(y, return_counts=True) - - # Separate rare and common labels - rare_labels = 
unique_labels[label_counts == 1] - - # Create initial mask for rare labels to go into training set - rare_label_mask = np.isin(y, rare_labels) - - # Separate data into rare and common label datasets - X_rare = X[rare_label_mask] - y_rare = y[rare_label_mask] - X_common = X[~rare_label_mask] - y_common = y[~rare_label_mask] - - # Split common labels stratified - X_common_train, X_common_test, y_common_train, y_common_test = train_test_split( - X_common, y_common, test_size=test_size, stratify=y_common - ) - - # Combine rare labels with common labels split - X_train = np.concatenate([X_rare, X_common_train]) - y_train = np.concatenate([y_rare, y_common_train]) - X_test = X_common_test - y_test = y_common_test - - return X_train, X_test, y_train, y_test - -def add_libelles( - df: pd.DataFrame, - df_naf: pd.DataFrame, - y: str, - text_feature: str, - textual_features: list, - categorical_features: list, -): - missing_codes = set(df_naf["code"]) - fake_obs = df_naf[df_naf["code"].isin(missing_codes)] - fake_obs[y] = fake_obs["code"] - fake_obs[text_feature] = fake_obs[[text_feature]].apply( - lambda row: " ".join(f"[{col}] {val}" for col, val in row.items() if val != ""), axis=1 - ) - df = pd.concat([df, fake_obs[[col for col in fake_obs.columns if col in df.columns]]]) - - if textual_features is not None: - for feature in textual_features: - df[feature] = df[feature].fillna(value="") - if categorical_features is not None: - for feature in categorical_features: - df[feature] = df[feature].fillna(value="NaN") - - print(f"\t*** {len(missing_codes)} codes have been added in the database...\n") - return df \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 4d7aa65..47449ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,9 @@ [project] name = "torchtextclassifiers" -description = "An implementation of the https://github.com/facebookresearch/fastText supervised learning algorithm for text classification using Pytorch." +description = "A text classification toolkit to easily build, train and evaluate deep learning text classifiers using PyTorch." 
authors = [ - { name = "Tom Seimandi", email = "tom.seimandi@gmail.com" }, - { name = "Julien Pramil", email = "julien.pramil@insee.fr" }, - { name = "Meilame Tayebjee", email = "meilame.tayebjee@insee.fr" }, { name = "Cédric Couralet", email = "cedric.couralet@insee.fr" }, + { name = "Meilame Tayebjee", email = "meilame.tayebjee@insee.fr" }, ] readme = "README.md" repository = "https://github.com/InseeFrLab/torchTextClassifiers" @@ -31,7 +29,10 @@ dev = [ "nltk", "unidecode", "captum", - "pyarrow" + "pyarrow", + "pre-commit>=4.3.0", + "ruff>=0.14.3", + "ipywidgets>=8.1.8", ] docs = [ "sphinx>=5.0.0", @@ -46,6 +47,12 @@ docs = [ [project.optional-dependencies] explainability = ["unidecode", "nltk", "captum"] preprocess = ["unidecode", "nltk"] +huggingface = [ + "tokenizers>=0.22.1", + "transformers>=4.57.1", + "datasets>=4.3.0", +] + [build-system] requires = ["uv_build>=0.9.3,<0.10.0"] @@ -58,6 +65,3 @@ line-length = 100 [tool.uv.build-backend] module-name="torchTextClassifiers" module-root = "" - - - diff --git a/tests/benchmark_suite.py b/tests/benchmark_suite.py new file mode 100644 index 0000000..b59d7be --- /dev/null +++ b/tests/benchmark_suite.py @@ -0,0 +1,263 @@ +""" +Simplified benchmark suite for comparing tokenizers +""" + +import random +import time +from typing import Any, Dict, List + +import matplotlib.pyplot as plt +import numpy as np + +from torchTextClassifiers.tokenizers.ngram import NGramTokenizer +from torchTextClassifiers.tokenizers.WordPiece import WordPieceTokenizer + +# ============================================================================ +# Test Data Generation +# ============================================================================ + + +def generate_test_data(num_samples: int, avg_length: int = 50) -> List[str]: + """Generate synthetic text data.""" + words = [ + "the", + "quick", + "brown", + "fox", + "jumps", + "over", + "lazy", + "dog", + "machine", + "learning", + "artificial", + "intelligence", + "neural", + "network", + "tokenizer", + "optimization", + "performance", + "benchmark", + "testing", + "python", + "pytorch", + "numpy", + "data", + "processing", + "model", + ] + + sentences = [] + for _ in range(num_samples): + length = max(5, int(np.random.normal(avg_length, avg_length // 4))) + sentence = " ".join(random.choices(words, k=length)) + sentences.append(sentence) + + return sentences + + +# ============================================================================ +# Simple Benchmark +# ============================================================================ + + +def benchmark_tokenizer(tokenizer, data: List[str], name: str, runs: int = 3) -> Dict: + """Benchmark a single tokenizer on data.""" + + # Warmup + _ = tokenizer.tokenize(data[:10]) + + # Benchmark + times = [] + for _ in range(runs): + start = time.perf_counter() + _ = tokenizer.tokenize(data) + elapsed = time.perf_counter() - start + times.append(elapsed) + + mean_time = np.mean(times) + throughput = len(data) / mean_time + + return { + "name": name, + "time": mean_time, + "std": np.std(times), + "throughput": throughput, + "times": times, + } + + +def compare_tokenizers(tokenizers: Dict[str, Any], batch_sizes: List[int] = None): + """ + Compare multiple tokenizers across different batch sizes. 
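+
+    A minimal usage sketch (assuming ``ngram_tok`` and ``wp_tok`` are
+    already-trained tokenizer instances, e.g. built as in the ``__main__``
+    block at the bottom of this file)::
+
+        tokenizers = {"NGram": ngram_tok, "WordPiece": wp_tok}
+        results = compare_tokenizers(tokenizers, batch_sizes=[100, 500])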
+
+    Args:
+        tokenizers: Dict with {name: tokenizer_instance}
+        batch_sizes: List of batch sizes to test
+    """
+
+    if batch_sizes is None:
+        batch_sizes = [100, 500, 1000, 2000]
+
+    print("=" * 80)
+    print("TOKENIZER COMPARISON")
+    print("=" * 80)
+
+    results = {name: [] for name in tokenizers.keys()}
+
+    for batch_size in batch_sizes:
+        print(f"\n--- Batch Size: {batch_size} ---")
+        test_data = generate_test_data(batch_size)
+
+        batch_results = []
+        for name, tokenizer in tokenizers.items():
+            try:
+                result = benchmark_tokenizer(tokenizer, test_data, name)
+                results[name].append(result)
+
+                print(
+                    f"{name:20s}: {result['time']:.3f}s ± {result['std']:.3f}s "
+                    f"({result['throughput']:.0f} samples/sec)"
+                )
+                batch_results.append(result)
+
+            except Exception as e:
+                print(f"{name:20s}: FAILED - {e}")
+
+        # Show speedup
+        if len(batch_results) > 1:
+            fastest = min(batch_results, key=lambda x: x["time"])
+            slowest = max(batch_results, key=lambda x: x["time"])
+            speedup = slowest["time"] / fastest["time"]
+            print(f"\n → {fastest['name']} is {speedup:.2f}x faster than {slowest['name']}")
+
+    return results
+
+
+def plot_comparison(results: Dict[str, List[Dict]], save_path: str = "comparison.png"):
+    """Plot comparison results."""
+
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
+
+    # Plot 1: Throughput vs Batch Size
+    for name, data in results.items():
+        if not data:
+            continue
+        batch_sizes = [d["throughput"] * d["time"] for d in data]  # samples per run = throughput x time, i.e. the batch size
+        throughputs = [d["throughput"] for d in data]
+        ax1.plot(batch_sizes, throughputs, marker="o", label=name, linewidth=2)
+
+    ax1.set_xlabel("Batch Size")
+    ax1.set_ylabel("Throughput (samples/sec)")
+    ax1.set_title("Throughput Comparison")
+    ax1.legend()
+    ax1.grid(True, alpha=0.3)
+
+    # Plot 2: Time comparison (last batch size)
+    names = []
+    times = []
+    colors = []
+
+    for i, (name, data) in enumerate(results.items()):
+        if data:
+            names.append(name)
+            times.append(data[-1]["time"])
+            colors.append(f"C{i}")
+
+    if times:
+        bars = ax2.barh(range(len(names)), times, color=colors, alpha=0.7)
+        ax2.set_yticks(range(len(names)))
+        ax2.set_yticklabels(names)
+        ax2.set_xlabel("Time (seconds)")
+        ax2.set_title("Processing Time Comparison")
+        ax2.grid(True, alpha=0.3, axis="x")
+
+        # Add value labels
+        for i, (bar, t) in enumerate(zip(bars, times)):
+            ax2.text(t + 0.01, i, f"{t:.3f}s", va="center")
+
+        # Mark fastest
+        fastest_idx = times.index(min(times))
+        ax2.get_yticklabels()[fastest_idx].set_weight("bold")
+        ax2.get_yticklabels()[fastest_idx].set_color("green")
+
+    plt.tight_layout()
+    plt.savefig(save_path, dpi=150, bbox_inches="tight")
+    print(f"\n✓ Plot saved to {save_path}")
+    plt.close()
+
+
+def print_summary(results: Dict[str, List[Dict]]):
+    """Print summary statistics."""
+
+    print("\n" + "=" * 80)
+    print("SUMMARY")
+    print("=" * 80)
+
+    # Get last batch results (largest)
+    last_batch = {name: data[-1] for name, data in results.items() if data}
+
+    if not last_batch:
+        print("No results to summarize")
+        return
+
+    fastest = min(last_batch.items(), key=lambda x: x[1]["time"])
+    slowest = max(last_batch.items(), key=lambda x: x[1]["time"])
+
+    print(f"\n🏆 Winner: {fastest[0]}")
+    print(f" Time: {fastest[1]['time']:.3f}s")
+    print(f" Throughput: {fastest[1]['throughput']:.0f} samples/sec")
+
+    if len(last_batch) > 1:
+        speedup = slowest[1]["time"] / fastest[1]["time"]
+        print(f"\n {speedup:.2f}x faster than {slowest[0]}")
+
+    print("\n" + "-" * 80)
+    print("All tokenizers (sorted by speed):")
+    for name, result in 
sorted(last_batch.items(), key=lambda x: x[1]["time"]): + speedup = slowest[1]["time"] / result["time"] + print(f" {name:20s}: {result['time']:.3f}s ({speedup:.2f}x)") + + +if __name__ == "__main__": + """ + Simple usage example: + + 1. Train your tokenizers + 2. Put them in a dict + 3. Run comparison + """ + + print("Training tokenizers...") + training_data = generate_test_data(1000, avg_length=30) + + # Create tokenizers + tokenizers = {} + + # NGram tokenizer + tokenizers["NGram"] = NGramTokenizer( + min_count=2, + min_n=2, + max_n=4, + num_tokens=10000, + len_word_ngrams=2, + training_text=training_data, + ) + + # WordPiece tokenizer + wp = WordPieceTokenizer(vocab_size=10000) + wp.train(training_corpus=training_data) + tokenizers["WordPiece"] = wp + + print(f"\n✓ Trained {len(tokenizers)} tokenizers\n") + + # Run comparison + results = compare_tokenizers(tokenizers, batch_sizes=[100, 500, 1000]) + + # Plot results + plot_comparison(results) + + # Print summary + print_summary(results) + + print("\n✓ Benchmark complete!") diff --git a/tests/conftest.py b/tests/conftest.py index cde1c4e..4023570 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,19 +1,22 @@ -import pytest +from unittest.mock import Mock + import numpy as np -from unittest.mock import Mock, MagicMock +import pytest @pytest.fixture def sample_text_data(): """Sample text data for testing.""" - return np.array([ - "This is a positive example", - "This is a negative example", - "Another positive case", - "Another negative case", - "Good example here", - "Bad example here" - ]) + return np.array( + [ + "This is a positive example", + "This is a negative example", + "Another positive case", + "Another negative case", + "Good example here", + "Bad example here", + ] + ) @pytest.fixture @@ -25,14 +28,7 @@ def sample_labels(): @pytest.fixture def sample_categorical_data(): """Sample categorical data for testing.""" - return np.array([ - [1, 2], - [2, 1], - [1, 3], - [3, 1], - [2, 2], - [3, 3] - ]) + return np.array([[1, 2], [2, 1], [1, 3], [3, 1], [2, 2], [3, 3]]) @pytest.fixture @@ -48,33 +44,32 @@ def sample_X_text_only(sample_text_data): @pytest.fixture -def fasttext_config(): - """Mock FastText configuration.""" - from torchTextClassifiers.classifiers.fasttext.core import FastTextConfig - - config = FastTextConfig( +def model_config(): + """Mock model configuration.""" + from torchTextClassifiers import ModelConfig + + config = ModelConfig( embedding_dim=10, - sparse=False, - num_tokens=1000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2, - num_classes=2 + categorical_vocabulary_sizes=[4, 5], + categorical_embedding_dims=[3, 4], + num_classes=10, ) return config @pytest.fixture def mock_tokenizer(): - """Mock NGramTokenizer for testing.""" + """Mock BaseTokenizer for testing.""" tokenizer = Mock() - tokenizer.min_count = 1 - tokenizer.min_n = 3 - tokenizer.max_n = 6 - tokenizer.num_tokens = 1000 - tokenizer.word_ngrams = 2 - tokenizer.padding_index = 999 + tokenizer.vocab_size = 1000 + tokenizer.padding_idx = 1 + tokenizer.tokenize = Mock( + return_value={ + "input_ids": np.array([[1, 2, 3], [4, 5, 6]]), + "attention_mask": np.array([[1, 1, 1], [1, 1, 1]]), + } + ) + tokenizer.output_dim = 50 return tokenizer @@ -108,4 +103,4 @@ def mock_dataset(): @pytest.fixture def mock_dataloader(): """Mock dataloader for testing.""" - return Mock() \ No newline at end of file + return Mock() diff --git a/tests/test_base_classes.py b/tests/test_base_classes.py deleted file mode 100644 index 1d7d7dc..0000000 --- 
a/tests/test_base_classes.py +++ /dev/null @@ -1,256 +0,0 @@ -import pytest -import numpy as np -from abc import ABC -from unittest.mock import Mock - -from torchTextClassifiers.classifiers.base import BaseClassifierConfig, BaseClassifierWrapper - - -class TestBaseClassifierConfig: - """Test the BaseClassifierConfig abstract base class.""" - - def test_cannot_instantiate_abstract_class(self): - """Test that BaseClassifierConfig cannot be instantiated directly.""" - with pytest.raises(TypeError): - BaseClassifierConfig() - - def test_concrete_implementation_required_methods(self): - """Test that concrete implementations must provide required methods.""" - - class ConcreteConfig(BaseClassifierConfig): - def __init__(self, value): - self.value = value - - def to_dict(self): - return {"value": self.value} - - @classmethod - def from_dict(cls, data): - return cls(data["value"]) - - # Should work with all methods implemented - config = ConcreteConfig(42) - assert config.value == 42 - - # Test serialization - config_dict = config.to_dict() - assert config_dict == {"value": 42} - - # Test deserialization - restored_config = ConcreteConfig.from_dict(config_dict) - assert restored_config.value == 42 - - def test_incomplete_implementation_fails(self): - """Test that incomplete implementations cannot be instantiated.""" - - class IncompleteConfig(BaseClassifierConfig): - def to_dict(self): - return {} - # Missing from_dict method - - with pytest.raises(TypeError): - IncompleteConfig() - - -class TestBaseClassifierWrapper: - """Test the BaseClassifierWrapper abstract base class.""" - - def test_cannot_instantiate_abstract_class(self): - """Test that BaseClassifierWrapper cannot be instantiated directly.""" - mock_config = Mock() - with pytest.raises(TypeError): - BaseClassifierWrapper(mock_config) - - def test_concrete_implementation_initialization(self): - """Test that concrete implementations can be initialized.""" - - class ConcreteWrapper(BaseClassifierWrapper): - def prepare_text_features(self, training_text): - pass # Mock implementation - - def _build_pytorch_model(self): - self.pytorch_model = "mock_model" - - def _check_and_init_lightning(self, **kwargs): - self.lightning_module = "mock_lightning" - - def predict(self, X, **kwargs): - return np.array([1, 0, 1]) - - def validate(self, X, Y, **kwargs): - return 0.85 - - def create_dataset(self, texts, labels, categorical_variables=None): - return "mock_dataset" - - def create_dataloader(self, dataset, batch_size, num_workers=0, shuffle=True): - return "mock_dataloader" - - def load_best_model(self, checkpoint_path): - self.trained = True - - @classmethod - def get_config_class(cls): - return Mock - - mock_config = Mock() - wrapper = ConcreteWrapper(mock_config) - - # Test initialization - assert wrapper.config == mock_config - assert wrapper.pytorch_model is None - assert wrapper.lightning_module is None - assert wrapper.trained == False - assert wrapper.device is None - - def test_concrete_implementation_methods(self): - """Test that concrete implementations can use all methods.""" - - class ConcreteWrapper(BaseClassifierWrapper): - def prepare_text_features(self, training_text): - pass # Mock implementation - - def _build_pytorch_model(self): - self.pytorch_model = "pytorch_model" - - def _check_and_init_lightning(self, **kwargs): - self.lightning_module = f"lightning_with_{kwargs}" - - def predict(self, X, **kwargs): - return np.array([1] * len(X)) - - def validate(self, X, Y, **kwargs): - return float(np.mean(Y)) - - def create_dataset(self, 
texts, labels, categorical_variables=None): - return { - "texts": texts, - "labels": labels, - "categorical": categorical_variables - } - - def create_dataloader(self, dataset, batch_size, num_workers=0, shuffle=True): - return { - "dataset": dataset, - "batch_size": batch_size, - "num_workers": num_workers, - "shuffle": shuffle - } - - def load_best_model(self, checkpoint_path): - self.trained = True - self.pytorch_model = f"model_from_{checkpoint_path}" - - @classmethod - def get_config_class(cls): - return Mock - - mock_config = Mock() - wrapper = ConcreteWrapper(mock_config) - - # Test prepare_text_features - sample_texts = np.array(["text1", "text2", "text3"]) - wrapper.prepare_text_features(sample_texts) - # Mock implementation doesn't set anything, so just test it doesn't crash - - # Test _build_pytorch_model - wrapper._build_pytorch_model() - assert wrapper.pytorch_model == "pytorch_model" - - # Test _check_and_init_lightning - wrapper._check_and_init_lightning(learning_rate=0.01) - assert "learning_rate" in str(wrapper.lightning_module) - - # Test predict - X = np.array(["test1", "test2"]) - predictions = wrapper.predict(X) - np.testing.assert_array_equal(predictions, np.array([1, 1])) - - # Test validate - Y = np.array([0, 1, 1, 0]) - accuracy = wrapper.validate(X, Y) - assert accuracy == 0.5 - - # Test create_dataset - labels = np.array([1, 0]) - dataset = wrapper.create_dataset(sample_texts[:2], labels) - np.testing.assert_array_equal(dataset["texts"], sample_texts[:2]) - np.testing.assert_array_equal(dataset["labels"], labels) - assert dataset["categorical"] is None - - # Test create_dataset with categorical - categorical = np.array([[1, 2], [3, 4]]) - dataset_with_cat = wrapper.create_dataset(sample_texts[:2], labels, categorical) - np.testing.assert_array_equal(dataset_with_cat["categorical"], categorical) - - # Test create_dataloader - dataloader = wrapper.create_dataloader(dataset, batch_size=32, num_workers=4, shuffle=False) - assert dataloader["dataset"] == dataset - assert dataloader["batch_size"] == 32 - assert dataloader["num_workers"] == 4 - assert dataloader["shuffle"] == False - - # Test load_best_model - checkpoint_path = "/path/to/checkpoint" - wrapper.load_best_model(checkpoint_path) - assert wrapper.trained == True - assert wrapper.pytorch_model == f"model_from_{checkpoint_path}" - - def test_incomplete_implementation_fails(self): - """Test that incomplete implementations cannot be instantiated.""" - - class IncompleteWrapper(BaseClassifierWrapper): - def prepare_text_features(self, training_text): - pass - - def _build_pytorch_model(self): - pass - - def _check_and_init_lightning(self, **kwargs): - pass - - def predict(self, X, **kwargs): - return np.array([]) - - # Missing: validate, create_dataset, create_dataloader, load_best_model - - mock_config = Mock() - with pytest.raises(TypeError): - IncompleteWrapper(mock_config) - - def test_method_signatures(self): - """Test that abstract methods have correct signatures.""" - - class ConcreteWrapper(BaseClassifierWrapper): - def prepare_text_features(self, training_text: np.ndarray) -> None: - pass - - def _build_pytorch_model(self) -> None: - pass - - def _check_and_init_lightning(self, **kwargs) -> None: - pass - - def predict(self, X: np.ndarray, **kwargs) -> np.ndarray: - return np.array([]) - - def validate(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - return 0.0 - - def create_dataset(self, texts: np.ndarray, labels: np.ndarray, categorical_variables=None): - return None - - def 
create_dataloader(self, dataset, batch_size: int, num_workers: int = 0, shuffle: bool = True): - return None - - def load_best_model(self, checkpoint_path: str) -> None: - pass - - @classmethod - def get_config_class(cls): - return Mock - - # Should be able to instantiate with all methods implemented - mock_config = Mock() - wrapper = ConcreteWrapper(mock_config) - assert wrapper is not None \ No newline at end of file diff --git a/tests/test_core_functionality.py b/tests/test_core_functionality.py deleted file mode 100644 index a307e64..0000000 --- a/tests/test_core_functionality.py +++ /dev/null @@ -1,214 +0,0 @@ -import pytest -import numpy as np -import json -import tempfile -import os -from unittest.mock import Mock, patch, MagicMock - - -def test_basic_imports(): - """Test that core modules can be imported without torch dependencies.""" - # Test that the main class can be imported - from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers - assert torchTextClassifiers is not None - - -def test_wrapper_based_pattern(): - """Test the wrapper-based pattern without actual implementations.""" - from torchTextClassifiers.classifiers.base import BaseClassifierWrapper, BaseClassifierConfig - - # Test that base classes exist and have expected structure - assert hasattr(BaseClassifierWrapper, 'prepare_text_features') - assert hasattr(BaseClassifierWrapper, '_build_pytorch_model') - assert hasattr(BaseClassifierWrapper, 'get_config_class') - assert hasattr(BaseClassifierConfig, 'to_dict') - assert hasattr(BaseClassifierConfig, 'from_dict') - - -def test_class_structure(): - """Test that the main class has the expected class structure.""" - # We can test the class structure without instantiating - from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers - - # Check that it has the expected methods defined - assert hasattr(torchTextClassifiers, '__init__') - assert hasattr(torchTextClassifiers, 'build_tokenizer') # backward compatibility - assert hasattr(torchTextClassifiers, 'prepare_text_features') # new method - assert hasattr(torchTextClassifiers, 'build') - assert hasattr(torchTextClassifiers, 'train') - assert hasattr(torchTextClassifiers, 'predict') - assert hasattr(torchTextClassifiers, 'validate') - assert hasattr(torchTextClassifiers, 'to_json') - assert hasattr(torchTextClassifiers, 'from_json') - - -def test_abstract_base_classes(): - """Test the abstract base class structure.""" - from torchTextClassifiers.classifiers.base import BaseClassifierConfig, BaseClassifierWrapper - - # Test that they are abstract - with pytest.raises(TypeError): - BaseClassifierConfig() - - # Test that BaseClassifierWrapper requires a config but still can't be instantiated - mock_config = Mock() - with pytest.raises(TypeError): - BaseClassifierWrapper(mock_config) - - -def test_utilities_import(): - """Test that utility functions can be imported.""" - from torchTextClassifiers.utilities.checkers import check_X, check_Y, NumpyJSONEncoder - - # Test basic functionality that doesn't depend on complex imports - assert callable(check_X) - assert callable(check_Y) - assert NumpyJSONEncoder is not None - - -def test_torchTextClassifiers_initialization_pattern(): - """Test the initialization pattern using mocks.""" - from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers - - # Mock wrapper with config - mock_wrapper = Mock() - mock_config = Mock() - mock_wrapper.config = mock_config - - # Create instance directly with wrapper - classifier = 
torchTextClassifiers(mock_wrapper) - - # Verify initialization - assert classifier.classifier == mock_wrapper - assert classifier.config == mock_config - - -def test_numpy_json_encoder(): - """Test the custom JSON encoder for numpy arrays.""" - from torchTextClassifiers.utilities.checkers import NumpyJSONEncoder - - # Test with numpy array - test_data = { - "array": np.array([1, 2, 3]), - "scalar": np.int64(42), - "regular": "string" - } - - # Should not raise an error - json_str = json.dumps(test_data, cls=NumpyJSONEncoder) - assert isinstance(json_str, str) - - # Verify it can be loaded back - loaded_data = json.loads(json_str) - assert loaded_data["regular"] == "string" - - -def test_create_fasttext_classmethod(): - """Test the create_fasttext class method through FastTextFactory.""" - from torchTextClassifiers.classifiers.fasttext.core import FastTextFactory - from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers - from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper - - # Just test that it creates a real instance and config properly - result = FastTextFactory.create_fasttext( - embedding_dim=50, - sparse=True, - num_tokens=5000, - min_count=2, - min_n=2, - max_n=5, - len_word_ngrams=3, - num_classes=2 - ) - - # Verify the result is a proper torchTextClassifiers instance - assert isinstance(result, torchTextClassifiers) - assert isinstance(result.classifier, FastTextWrapper) - assert result.config.embedding_dim == 50 - assert result.config.sparse == True - assert result.config.num_tokens == 5000 - - -def test_method_delegation_pattern(): - """Test that the main class properly delegates to wrapper methods.""" - from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers - - # Create a mock instance - classifier = Mock(spec=torchTextClassifiers) - classifier.classifier = Mock() - - # Test predict delegation - expected_result = np.array([1, 0, 1]) - classifier.classifier.predict.return_value = expected_result - - # Apply the real predict method to our mock - sample_X = np.array(["test1", "test2", "test3"]) - result = torchTextClassifiers.predict(classifier, sample_X) - - classifier.classifier.predict.assert_called_once_with(sample_X) - assert result is expected_result - - -def test_error_handling_patterns(): - """Test expected error handling without actual implementation.""" - - # Test that incomplete wrapper configurations raise appropriate errors - from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers - - # Test with invalid wrapper (missing config attribute) - class InvalidWrapper: - pass - - invalid_wrapper = InvalidWrapper() - - # This should raise AttributeError for missing config - with pytest.raises(AttributeError): - torchTextClassifiers(invalid_wrapper) - - -@pytest.mark.parametrize("method_name,expected_args", [ - ("predict", ["X"]), - ("validate", ["X", "Y"]), - ("prepare_text_features", ["training_text"]), -]) -def test_wrapper_method_signatures(method_name, expected_args): - """Test that wrapper methods have expected signatures.""" - from torchTextClassifiers.classifiers.base import BaseClassifierWrapper - - # Get the method from the abstract class - method = getattr(BaseClassifierWrapper, method_name) - - # Check that it's abstract - assert hasattr(method, '__isabstractmethod__') - assert method.__isabstractmethod__ == True - - -def test_configuration_serialization_pattern(): - """Test the configuration serialization pattern.""" - from torchTextClassifiers.classifiers.base import 
BaseClassifierConfig - - # Verify abstract methods exist - assert hasattr(BaseClassifierConfig, 'to_dict') - assert hasattr(BaseClassifierConfig, 'from_dict') - - # Verify they are abstract - assert BaseClassifierConfig.to_dict.__isabstractmethod__ == True - assert BaseClassifierConfig.from_dict.__isabstractmethod__ == True - - -def test_sample_data_fixtures(sample_text_data, sample_labels, sample_categorical_data): - """Test that our test fixtures work correctly.""" - assert len(sample_text_data) == 6 - assert len(sample_labels) == 6 - assert sample_categorical_data.shape == (6, 2) - - # Verify data types - assert isinstance(sample_text_data, np.ndarray) - assert isinstance(sample_labels, np.ndarray) - assert isinstance(sample_categorical_data, np.ndarray) - - # Verify content makes sense - assert all(isinstance(text, str) for text in sample_text_data) - assert all(label in [0, 1] for label in sample_labels) - - diff --git a/tests/test_fasttext_implementation.py b/tests/test_fasttext_implementation.py deleted file mode 100644 index 770ae83..0000000 --- a/tests/test_fasttext_implementation.py +++ /dev/null @@ -1,445 +0,0 @@ -import pytest -import numpy as np -import torch -from unittest.mock import Mock, patch, MagicMock - -from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper -from torchTextClassifiers.classifiers.fasttext.core import FastTextConfig -from torchTextClassifiers.classifiers.fasttext.tokenizer import NGramTokenizer -from torchTextClassifiers.classifiers.fasttext.model import FastTextModelDataset, FastTextModel, FastTextModule - - -class TestFastTextConfig: - """Test FastTextConfig class.""" - - def test_config_creation(self): - """Test basic config creation.""" - config = FastTextConfig( - embedding_dim=100, - sparse=True, - num_tokens=5000, - min_count=2, - min_n=3, - max_n=6, - len_word_ngrams=2, - num_classes=3 - ) - - assert config.embedding_dim == 100 - assert config.sparse == True - assert config.num_tokens == 5000 - assert config.min_count == 2 - assert config.min_n == 3 - assert config.max_n == 6 - assert config.len_word_ngrams == 2 - assert config.num_classes == 3 - - def test_config_to_dict(self): - """Test config serialization to dictionary.""" - config = FastTextConfig( - embedding_dim=100, - sparse=False, - num_tokens=1000, - min_count=1, - min_n=3, - max_n=6, - len_word_ngrams=2 - ) - - config_dict = config.to_dict() - - assert isinstance(config_dict, dict) - assert config_dict['embedding_dim'] == 100 - assert config_dict['sparse'] == False - assert config_dict['num_tokens'] == 1000 - - def test_config_from_dict(self): - """Test config deserialization from dictionary.""" - config_dict = { - 'embedding_dim': 50, - 'sparse': True, - 'num_tokens': 2000, - 'min_count': 3, - 'min_n': 2, - 'max_n': 5, - 'len_word_ngrams': 3, - 'num_classes': 4 - } - - config = FastTextConfig.from_dict(config_dict) - - assert config.embedding_dim == 50 - assert config.sparse == True - assert config.num_tokens == 2000 - assert config.num_classes == 4 - - -class TestFastTextWrapper: - """Test FastTextWrapper class.""" - - def test_wrapper_initialization(self, fasttext_config): - """Test wrapper initialization.""" - wrapper = FastTextWrapper(fasttext_config) - - assert wrapper.config == fasttext_config - assert wrapper.tokenizer is None - assert wrapper.pytorch_model is None - assert wrapper.lightning_module is None - assert wrapper.trained == False - assert wrapper.device is None - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.NGramTokenizer') - def 
test_build_tokenizer(self, mock_tokenizer_class, fasttext_config, sample_text_data): - """Test tokenizer building.""" - mock_tokenizer = Mock() - mock_tokenizer_class.return_value = mock_tokenizer - - wrapper = FastTextWrapper(fasttext_config) - wrapper.build_tokenizer(sample_text_data) - - mock_tokenizer_class.assert_called_once_with( - fasttext_config.min_count, - fasttext_config.min_n, - fasttext_config.max_n, - fasttext_config.num_tokens, - fasttext_config.len_word_ngrams, - sample_text_data - ) - assert wrapper.tokenizer == mock_tokenizer - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextModel') - def test_build_pytorch_model_with_tokenizer(self, mock_model_class, fasttext_config, mock_tokenizer): - """Test PyTorch model building with existing tokenizer.""" - mock_model = Mock() - mock_model_class.return_value = mock_model - - wrapper = FastTextWrapper(fasttext_config) - wrapper.tokenizer = mock_tokenizer - - wrapper._build_pytorch_model() - - # Verify model was created with correct parameters - mock_model_class.assert_called_once() - call_kwargs = mock_model_class.call_args[1] - assert call_kwargs['tokenizer'] == mock_tokenizer - assert call_kwargs['embedding_dim'] == fasttext_config.embedding_dim - assert call_kwargs['num_classes'] == fasttext_config.num_classes - - assert wrapper.pytorch_model == mock_model - - def test_build_pytorch_model_no_tokenizer_no_num_rows(self, fasttext_config): - """Test PyTorch model building fails without tokenizer or num_rows.""" - wrapper = FastTextWrapper(fasttext_config) - # No tokenizer and no num_rows in config - wrapper.tokenizer = None - wrapper.config.num_rows = None - - with pytest.raises(ValueError, match="Please provide a tokenizer or num_rows"): - wrapper._build_pytorch_model() - - @patch('torch.optim.lr_scheduler.ReduceLROnPlateau') - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextModule') - def test_check_and_init_lightning_basic(self, mock_module_class, mock_scheduler, fasttext_config, mock_pytorch_model): - """Test Lightning module initialization.""" - mock_module = Mock() - mock_module_class.return_value = mock_module - - wrapper = FastTextWrapper(fasttext_config) - wrapper.pytorch_model = mock_pytorch_model - - wrapper._check_and_init_lightning(lr=0.01) - - # Verify Lightning module was created - mock_module_class.assert_called_once() - assert wrapper.lightning_module == mock_module - assert wrapper.optimizer_params == {"lr": 0.01} - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextModule') - def test_check_and_init_lightning_uses_config_lr(self, mock_module_class, fasttext_config, mock_pytorch_model): - """Test Lightning module initialization uses config learning rate as default.""" - wrapper = FastTextWrapper(fasttext_config) - wrapper.pytorch_model = mock_pytorch_model - mock_module = Mock() - mock_module_class.return_value = mock_module - - # Should not raise an error since learning_rate is in config - wrapper._check_and_init_lightning() - - # Check that the learning rate from config was used - assert wrapper.optimizer_params['lr'] == fasttext_config.learning_rate - assert wrapper.lightning_module == mock_module - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.check_X') - def test_predict_not_trained(self, mock_check_X, fasttext_config, sample_text_data): - """Test prediction fails when model not trained.""" - wrapper = FastTextWrapper(fasttext_config) - wrapper.trained = False - - with pytest.raises(Exception, match="Model must be trained first"): - 
wrapper.predict(sample_text_data) - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.check_X') - def test_predict_success(self, mock_check_X, fasttext_config, sample_text_data, mock_pytorch_model): - """Test successful prediction.""" - mock_check_X.return_value = (sample_text_data, None, True) - expected_predictions = np.array([[1], [0], [1]]) # With top_k dimension - expected_confidence = np.array([[0.9], [0.8], [0.95]]) - mock_pytorch_model.predict.return_value = (expected_predictions, expected_confidence) - mock_pytorch_model.no_cat_var = True - - wrapper = FastTextWrapper(fasttext_config) - wrapper.trained = True - wrapper.pytorch_model = mock_pytorch_model - wrapper.config.num_categorical_features = None - - result = wrapper.predict(sample_text_data) - - mock_pytorch_model.predict.assert_called_once() - # The wrapper should squeeze the top_k dimension for top_k=1 - expected_result = np.array([1, 0, 1]) - np.testing.assert_array_equal(result, expected_result) - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextWrapper.predict') - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.check_Y') - def test_validate_success(self, mock_check_Y, mock_predict, fasttext_config, - sample_text_data, sample_labels): - """Test successful validation.""" - mock_predictions = np.array([1, 0, 1]) - mock_predict.return_value = mock_predictions - mock_check_Y.return_value = np.array([1, 0, 1]) # Perfect predictions - - wrapper = FastTextWrapper(fasttext_config) - wrapper.trained = True - - result = wrapper.validate(sample_text_data, sample_labels) - - mock_predict.assert_called_once_with(sample_text_data) - mock_check_Y.assert_called_once_with(sample_labels) - assert result == 1.0 # Perfect accuracy - - def test_create_dataset(self, fasttext_config, sample_text_data, sample_labels, mock_tokenizer): - """Test dataset creation.""" - wrapper = FastTextWrapper(fasttext_config) - wrapper.tokenizer = mock_tokenizer - - with patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextModelDataset') as mock_dataset_class: - mock_dataset = Mock() - mock_dataset_class.return_value = mock_dataset - - result = wrapper.create_dataset(sample_text_data, sample_labels) - - mock_dataset_class.assert_called_once_with( - categorical_variables=None, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - assert result == mock_dataset - - def test_create_dataset_with_categorical(self, fasttext_config, sample_text_data, sample_labels, - sample_categorical_data, mock_tokenizer): - """Test dataset creation with categorical variables.""" - wrapper = FastTextWrapper(fasttext_config) - wrapper.tokenizer = mock_tokenizer - - with patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextModelDataset') as mock_dataset_class: - mock_dataset = Mock() - mock_dataset_class.return_value = mock_dataset - - result = wrapper.create_dataset(sample_text_data, sample_labels, sample_categorical_data) - - mock_dataset_class.assert_called_once_with( - categorical_variables=sample_categorical_data, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - assert result == mock_dataset - - def test_create_dataloader(self, fasttext_config, mock_dataset): - """Test dataloader creation.""" - mock_dataloader = Mock() - mock_dataset.create_dataloader.return_value = mock_dataloader - - wrapper = FastTextWrapper(fasttext_config) - - result = wrapper.create_dataloader(mock_dataset, batch_size=32, num_workers=4, shuffle=True) - - 
mock_dataset.create_dataloader.assert_called_once_with( - batch_size=32, num_workers=4, shuffle=True - ) - assert result == mock_dataloader - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.FastTextModule') - def test_load_best_model(self, mock_module_class, fasttext_config, mock_pytorch_model): - """Test loading best model from checkpoint.""" - mock_loaded_module = Mock() - mock_loaded_module.model = mock_pytorch_model - mock_module_class.load_from_checkpoint.return_value = mock_loaded_module - - wrapper = FastTextWrapper(fasttext_config) - wrapper.pytorch_model = mock_pytorch_model - wrapper.loss = Mock() - wrapper.optimizer = Mock() - wrapper.optimizer_params = {} - wrapper.scheduler = Mock() - wrapper.scheduler_params = {} - - mock_pytorch_model.to.return_value = mock_pytorch_model - mock_pytorch_model.eval = Mock() - - checkpoint_path = "/fake/checkpoint/path" - wrapper.load_best_model(checkpoint_path) - - # Verify checkpoint loading - mock_module_class.load_from_checkpoint.assert_called_once_with( - checkpoint_path, - model=mock_pytorch_model, - loss=wrapper.loss, - optimizer=wrapper.optimizer, - optimizer_params=wrapper.optimizer_params, - scheduler=wrapper.scheduler, - scheduler_params=wrapper.scheduler_params, - scheduler_interval="epoch" - ) - - # Verify model state updates - assert wrapper.lightning_module == mock_loaded_module - assert wrapper.pytorch_model == mock_pytorch_model - assert wrapper.trained == True - mock_pytorch_model.eval.assert_called_once() - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.check_X') - def test_predict_and_explain_success(self, mock_check_X, fasttext_config, sample_text_data, mock_pytorch_model): - """Test successful predict_and_explain.""" - mock_check_X.return_value = (sample_text_data, None, True) - expected_result = (np.array([1, 0, 1]), np.array([0.8, 0.2, 0.9])) - mock_pytorch_model.predict_and_explain.return_value = expected_result - mock_pytorch_model.no_cat_var = True - - wrapper = FastTextWrapper(fasttext_config) - wrapper.trained = True - wrapper.pytorch_model = mock_pytorch_model - wrapper.config.num_categorical_features = None - - result = wrapper.predict_and_explain(sample_text_data) - - mock_pytorch_model.predict_and_explain.assert_called_once() - assert result == expected_result - - @patch('torchTextClassifiers.classifiers.fasttext.wrapper.check_X') - def test_predict_and_explain_not_trained(self, mock_check_X, fasttext_config, sample_text_data): - """Test predict_and_explain fails when model not trained.""" - wrapper = FastTextWrapper(fasttext_config) - wrapper.trained = False - - with pytest.raises(Exception, match="Model must be trained first"): - wrapper.predict_and_explain(sample_text_data) - - -class TestFastTextModelDataset: - """Test FastTextModelDataset class.""" - - def test_dataset_initialization_text_only(self, sample_text_data, sample_labels, mock_tokenizer): - """Test dataset initialization with text only.""" - dataset = FastTextModelDataset( - categorical_variables=None, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - - assert len(dataset) == len(sample_text_data) - assert dataset.texts is sample_text_data - assert dataset.outputs is sample_labels - assert dataset.tokenizer is mock_tokenizer - assert dataset.categorical_variables is None - - def test_dataset_initialization_with_categorical(self, sample_text_data, sample_labels, - sample_categorical_data, mock_tokenizer): - """Test dataset initialization with categorical variables.""" - dataset = 
FastTextModelDataset( - categorical_variables=sample_categorical_data, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - - assert len(dataset) == len(sample_text_data) - assert dataset.categorical_variables is sample_categorical_data - - def test_dataset_length_mismatch_categorical(self, sample_text_data, sample_labels, mock_tokenizer): - """Test dataset initialization fails with mismatched lengths.""" - wrong_categorical = np.array([[1, 2]]) # Wrong length - - with pytest.raises(ValueError, match="Categorical variables and texts must have the same length"): - FastTextModelDataset( - categorical_variables=wrong_categorical, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - - def test_dataset_length_mismatch_outputs(self, sample_text_data, mock_tokenizer): - """Test dataset initialization fails with mismatched output lengths.""" - wrong_outputs = np.array([1, 0]) # Wrong length - - with pytest.raises(ValueError, match="Outputs and texts must have the same length"): - FastTextModelDataset( - categorical_variables=None, - texts=sample_text_data, - outputs=wrong_outputs, - tokenizer=mock_tokenizer - ) - - def test_dataset_getitem_with_outputs(self, sample_text_data, sample_labels, mock_tokenizer): - """Test dataset __getitem__ with outputs.""" - dataset = FastTextModelDataset( - categorical_variables=None, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - - text, categorical_vars, output = dataset[0] - - assert text == sample_text_data[0] - assert categorical_vars is None - assert output == sample_labels[0] - - def test_dataset_getitem_without_outputs(self, sample_text_data, mock_tokenizer): - """Test dataset __getitem__ without outputs.""" - dataset = FastTextModelDataset( - categorical_variables=None, - texts=sample_text_data, - outputs=None, - tokenizer=mock_tokenizer - ) - - text, categorical_vars = dataset[0] - - assert text == sample_text_data[0] - assert categorical_vars is None - - @patch('torch.utils.data.DataLoader') - def test_create_dataloader(self, mock_dataloader_class, sample_text_data, sample_labels, mock_tokenizer): - """Test dataloader creation.""" - mock_dataloader = Mock() - mock_dataloader_class.return_value = mock_dataloader - - dataset = FastTextModelDataset( - categorical_variables=None, - texts=sample_text_data, - outputs=sample_labels, - tokenizer=mock_tokenizer - ) - - result = dataset.create_dataloader(batch_size=16, shuffle=True, num_workers=2) - - mock_dataloader_class.assert_called_once() - call_kwargs = mock_dataloader_class.call_args[1] - assert call_kwargs['batch_size'] == 16 - assert call_kwargs['shuffle'] == True - assert call_kwargs['num_workers'] == 2 - - assert result == mock_dataloader \ No newline at end of file diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000..b272acc --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,236 @@ +import numpy as np +import pytest +import torch + +from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers +from torchTextClassifiers.dataset import TextClassificationDataset +from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule +from torchTextClassifiers.model.components import ( + AttentionConfig, + CategoricalVariableNet, + ClassificationHead, + TextEmbedder, + TextEmbedderConfig, +) +from torchTextClassifiers.tokenizers import HuggingFaceTokenizer, NGramTokenizer, WordPieceTokenizer +from 
torchTextClassifiers.utilities.plot_explainability import (
+    map_attributions_to_char,
+    map_attributions_to_word,
+    plot_attributions_at_char,
+    plot_attributions_at_word,
+)
+
+
+@pytest.fixture
+def sample_data():
+    """Fixture providing sample data for all tests."""
+    sample_text_data = [
+        "This is a positive example",
+        "This is a negative example",
+        "Another positive case",
+        "Another negative case",
+        "Good example here",
+        "Bad example here",
+    ]
+    categorical_data = np.array([[1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1]]).astype(int)
+    labels = np.array([1, 0, 1, 0, 1, 5])
+
+    return sample_text_data, categorical_data, labels
+
+
+@pytest.fixture
+def model_params():
+    """Fixture providing common model parameters."""
+    return {
+        "embedding_dim": 96,
+        "n_layers": 2,
+        "n_head": 4,
+        "num_classes": 10,
+        "categorical_vocab_sizes": [2, 2],
+        "categorical_embedding_dims": [4, 7],
+    }
+
+
+def run_full_pipeline(tokenizer, sample_text_data, categorical_data, labels, model_params):
+    """Helper function to run the complete pipeline for a given tokenizer."""
+    # Create dataset
+    dataset = TextClassificationDataset(
+        texts=sample_text_data,
+        categorical_variables=categorical_data.tolist(),
+        tokenizer=tokenizer,
+        labels=None,
+    )
+
+    dataloader = dataset.create_dataloader(batch_size=4)
+    batch = next(iter(dataloader))
+
+    # Get tokenizer parameters
+    vocab_size = tokenizer.vocab_size
+    padding_idx = tokenizer.padding_idx
+    sequence_len = tokenizer.output_dim
+
+    # Create attention config
+    attention_config = AttentionConfig(
+        n_layers=model_params["n_layers"],
+        n_head=model_params["n_head"],
+        n_kv_head=model_params["n_head"],
+        sequence_len=sequence_len,
+    )
+
+    # Create text embedder
+    text_embedder_config = TextEmbedderConfig(
+        vocab_size=vocab_size,
+        embedding_dim=model_params["embedding_dim"],
+        padding_idx=padding_idx,
+        attention_config=attention_config,
+    )
+
+    text_embedder = TextEmbedder(text_embedder_config=text_embedder_config)
+    text_embedder.init_weights()
+
+    # Create categorical variable net
+    categorical_var_net = CategoricalVariableNet(
+        categorical_vocabulary_sizes=model_params["categorical_vocab_sizes"],
+        categorical_embedding_dims=model_params["categorical_embedding_dims"],
+    )
+
+    # Create classification head
+    expected_input_dim = model_params["embedding_dim"] + categorical_var_net.output_dim
+    classification_head = ClassificationHead(
+        input_dim=expected_input_dim,
+        num_classes=model_params["num_classes"],
+    )
+
+    # Create model
+    model = TextClassificationModel(
+        text_embedder=text_embedder,
+        categorical_variable_net=categorical_var_net,
+        classification_head=classification_head,
+    )
+
+    # Test forward pass
+    model(**batch)
+
+    # Create module
+    module = TextClassificationModule(
+        model=model,
+        loss=torch.nn.CrossEntropyLoss(),
+        optimizer=torch.optim.Adam,
+        optimizer_params={"lr": 1e-3},
+        scheduler=None,
+        scheduler_params=None,
+        scheduler_interval="epoch",
+    )
+
+    # Test prediction
+    module.predict_step(batch)
+
+    # Prepare data for training
+    X = np.column_stack([sample_text_data, categorical_data])
+    Y = labels
+
+    # Create model config
+    model_config = ModelConfig(
+        embedding_dim=model_params["embedding_dim"],
+        categorical_vocabulary_sizes=model_params["categorical_vocab_sizes"],
+        categorical_embedding_dims=model_params["categorical_embedding_dims"],
+        num_classes=model_params["num_classes"],
+        attention_config=attention_config,
+    )
+
+    # Create training config
+    training_config = TrainingConfig(
+        lr=1e-3,
+        batch_size=4,
+        num_epochs=1,
+    )
+
+    # Create classifier
+    ttc = torchTextClassifiers(
+        tokenizer=tokenizer,
+        model_config=model_config,
+    )
+
+    # Train
+    ttc.train(
+        X_train=X,
+        y_train=Y,
+        X_val=X,
+        y_val=Y,
+        training_config=training_config,
+    )
+
+    # Predict with explanations
+    top_k = 5
+    predictions = ttc.predict(X, top_k=top_k, explain=True)
+
+    # Test explainability functions
+    text_idx = 0
+    text = sample_text_data[text_idx]
+    offsets = predictions["offset_mapping"][text_idx]
+    attributions = predictions["attributions"][text_idx]
+    word_ids = predictions["word_ids"][text_idx]
+
+    word_attributions = map_attributions_to_word(attributions, word_ids)
+    char_attributions = map_attributions_to_char(attributions, offsets, text)
+
+    # Note: We're not actually plotting in tests, just calling the functions
+    # to ensure they don't raise errors
+    plot_attributions_at_char(text, char_attributions)
+    plot_attributions_at_word(text, word_attributions)
+
+
+def test_wordpiece_tokenizer(sample_data, model_params):
+    """Test the full pipeline with WordPieceTokenizer."""
+    sample_text_data, categorical_data, labels = sample_data
+
+    vocab_size = 100
+    tokenizer = WordPieceTokenizer(vocab_size, output_dim=50)
+    tokenizer.train(sample_text_data)
+
+    # Check tokenizer works
+    result = tokenizer.tokenize(sample_text_data)
+    assert result.input_ids.shape[0] == len(sample_text_data)
+
+    # Run full pipeline
+    run_full_pipeline(tokenizer, sample_text_data, categorical_data, labels, model_params)
+
+
+def test_huggingface_tokenizer(sample_data, model_params):
+    """Test the full pipeline with HuggingFaceTokenizer."""
+    sample_text_data, categorical_data, labels = sample_data
+
+    tokenizer = HuggingFaceTokenizer.load_from_pretrained(
+        "google-bert/bert-base-uncased", output_dim=50
+    )
+
+    # Check tokenizer works
+    result = tokenizer.tokenize(sample_text_data)
+    assert result.input_ids.shape[0] == len(sample_text_data)
+
+    # Run full pipeline
+    run_full_pipeline(tokenizer, sample_text_data, categorical_data, labels, model_params)
+
+
+def test_ngram_tokenizer(sample_data, model_params):
+    """Test the full pipeline with NGramTokenizer."""
+    sample_text_data, categorical_data, labels = sample_data
+
+    tokenizer = NGramTokenizer(
+        min_count=3, min_n=2, max_n=5, num_tokens=100, len_word_ngrams=2, output_dim=76
+    )
+    tokenizer.train(sample_text_data)
+
+    # Check tokenizer works
+    result = tokenizer.tokenize(
+        sample_text_data[0], return_offsets_mapping=True, return_word_ids=True
+    )
+    assert result.input_ids is not None
+
+    # Check batch decode
+    batch_result = tokenizer.tokenize(sample_text_data)
+    decoded = tokenizer.batch_decode(batch_result.input_ids.tolist())
+    assert len(decoded) == len(sample_text_data)
+
+    # Run full pipeline
+    run_full_pipeline(tokenizer, sample_text_data, categorical_data, labels, model_params)
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
new file mode 100644
index 0000000..fd5b9b1
--- /dev/null
+++ b/tests/test_tokenizer.py
@@ -0,0 +1,130 @@
+"""
+tests/test_tokenizer.py
+
+Pytest integration for tokenizer benchmarks.
+
+Run with: pytest tests/test_tokenizer.py --benchmark
+"""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from tests.benchmark_suite import (
+    compare_tokenizers,
+    generate_test_data,
+    plot_comparison,
+)
+from torchTextClassifiers.tokenizers.ngram import NGramTokenizer
+from torchTextClassifiers.tokenizers.WordPiece import WordPieceTokenizer
+
+
+@pytest.fixture(scope="module")
+def training_data():
+    """Generate training data once for all tests."""
+    return generate_test_data(1000, avg_length=30)
+
+
+@pytest.fixture(scope="module")
+def ngram_tokenizer(training_data):
+    """Create and train NGram tokenizer."""
+    tokenizer = NGramTokenizer(
+        min_count=2,
+        min_n=2,
+        max_n=4,
+        num_tokens=10000,
+        len_word_ngrams=2,
+        training_text=training_data,
+    )
+    return tokenizer
+
+
+@pytest.fixture(scope="module")
+def wordpiece_tokenizer(training_data):
+    """Create and train WordPiece tokenizer."""
+    wp = WordPieceTokenizer(vocab_size=10000)
+    wp.train(training_corpus=training_data)
+    return wp
+
+
+# ============================================================================
+# Regular Tests (Always Run)
+# ============================================================================
+
+
+def test_ngram_tokenizer_basic(ngram_tokenizer):
+    """Basic sanity test for NGram tokenizer."""
+    test_text = ["hello world", "machine learning is awesome"]
+    result = ngram_tokenizer.tokenize(test_text)
+
+    assert result.input_ids is not None
+    assert result.attention_mask is not None
+    assert result.input_ids.shape[0] == len(test_text)
+
+
+def test_wordpiece_tokenizer_basic(wordpiece_tokenizer):
+    """Basic sanity test for WordPiece tokenizer."""
+    test_text = ["hello world", "machine learning is awesome"]
+    result = wordpiece_tokenizer.tokenize(test_text)
+
+    assert result.input_ids is not None
+    assert result.attention_mask is not None
+    assert result.input_ids.shape[0] == len(test_text)
+
+
+# ============================================================================
+# Benchmark Tests (Run with --benchmark flag)
+# ============================================================================
+
+
+def test_tokenizer_comparison_small(ngram_tokenizer, wordpiece_tokenizer):
+    """Compare tokenizers on small batch (CI-friendly)."""
+    tokenizers = {
+        "NGram": ngram_tokenizer,
+        "WordPiece": wordpiece_tokenizer,
+    }
+
+    # Small batch sizes for CI
+    results = compare_tokenizers(tokenizers, batch_sizes=[100, 500])
+
+    # Ensure results were generated
+    assert len(results) == 2
+    for name, data in results.items():
+        assert len(data) > 0, f"{name} produced no results"
+
+
+def test_tokenizer_comparison_full(ngram_tokenizer, wordpiece_tokenizer):
+    """Full benchmark comparison (for local testing)."""
+    tokenizers = {
+        "NGram": ngram_tokenizer,
+        "WordPiece": wordpiece_tokenizer,
+    }
+
+    # Full benchmark
+    batch_sizes = [100, 500, 1000]
+    results = compare_tokenizers(tokenizers, batch_sizes=batch_sizes)
+
+    # Save results
+    output_dir = Path("benchmark_results")
+    output_dir.mkdir(exist_ok=True)
+
+    # Save plot
+    plot_comparison(results, save_path=str(output_dir / "comparison.png"))
+
+    # Save JSON results, pairing each measurement with the batch size it was run at
+    results_json = {}
+    for name, data in results.items():
+        results_json[name] = [
+            {
+                "batch_size": batch_size,
+                "time": d["time"],
+                "throughput": d["throughput"],
+            }
+            for batch_size, d in zip(batch_sizes, data)
+        ]
+
+    with open(output_dir / "results.json", "w") as f:
+        json.dump(results_json, f, indent=2)
+
+    print(f"\n✓ Results saved to {output_dir}/")
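The two comparison tests above are documented as opt-in via a `--benchmark` flag, but no pytest option of that name is defined anywhere in this diff. If the gating is intended, a `conftest.py` hook along the following lines would provide it. This is a minimal sketch, not part of the PR: the option name comes from the module docstring, and matching on the `tokenizer_comparison` test-id substring is an assumption.

```python
# conftest.py — hypothetical sketch of the --benchmark gate referenced above
import pytest


def pytest_addoption(parser):
    # Register the custom flag so `pytest --benchmark` is accepted.
    parser.addoption(
        "--benchmark",
        action="store_true",
        default=False,
        help="run the tokenizer benchmark comparison tests",
    )


def pytest_collection_modifyitems(config, items):
    # Without --benchmark, mark the comparison tests as skipped.
    if config.getoption("--benchmark"):
        return
    skip_benchmark = pytest.mark.skip(reason="needs --benchmark to run")
    for item in items:
        if "tokenizer_comparison" in item.nodeid:
            item.add_marker(skip_benchmark)
```

diff --git a/tests/test_torchTextClassifiers.py b/tests/test_torchTextClassifiers.py
index b80ee21..6ecf388 100644
--- a/tests/test_torchTextClassifiers.py
+++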
b/tests/test_torchTextClassifiers.py @@ -1,353 +1,17 @@ -import pytest -import numpy as np -import json -from unittest.mock import Mock, patch, MagicMock -import tempfile -import os - -from torchTextClassifiers.torchTextClassifiers import torchTextClassifiers -from torchTextClassifiers.classifiers.fasttext.core import FastTextConfig, FastTextFactory -from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper - - +from torchTextClassifiers import torchTextClassifiers +from torchTextClassifiers.model import TextClassificationModel +from torchTextClassifiers.model.components import ClassificationHead class TestTorchTextClassifiers: """Test the main torchTextClassifiers class.""" - - def test_initialization(self, fasttext_config): + + def test_initialization(self, model_config, mock_tokenizer): """Test basic initialization.""" - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - - assert classifier.config == fasttext_config - assert isinstance(classifier.classifier, FastTextWrapper) - assert classifier.classifier is wrapper - - def test_create_fasttext_classmethod(self): - """Test the create_fasttext class method.""" - classifier = FastTextFactory.create_fasttext( - embedding_dim=50, - sparse=True, - num_tokens=5000, - min_count=2, - min_n=2, - max_n=5, - len_word_ngrams=3, - num_classes=3 - ) - - assert isinstance(classifier.classifier, FastTextWrapper) - assert classifier.config.embedding_dim == 50 - assert classifier.config.sparse == True - assert classifier.config.num_tokens == 5000 - assert classifier.config.num_classes == 3 - - def test_build_from_tokenizer(self, mock_tokenizer): - """Test building classifier from existing tokenizer.""" - classifier = FastTextFactory.build_from_tokenizer( + ttc = torchTextClassifiers( tokenizer=mock_tokenizer, - embedding_dim=100, - num_classes=2, - sparse=False + model_config=model_config, ) - - assert isinstance(classifier.classifier, FastTextWrapper) - assert classifier.config.embedding_dim == 100 - assert classifier.config.num_classes == 2 - assert classifier.classifier.tokenizer == mock_tokenizer - - def test_build_from_tokenizer_missing_attributes(self): - """Test build_from_tokenizer with tokenizer missing attributes.""" - class IncompleteTokenizer: - def __init__(self): - self.min_count = 1 - # Missing: min_n, max_n, num_tokens, word_ngrams - - incomplete_tokenizer = IncompleteTokenizer() - - with pytest.raises(ValueError, match="Missing attributes in tokenizer"): - FastTextFactory.build_from_tokenizer( - tokenizer=incomplete_tokenizer, - embedding_dim=100, - num_classes=2 - ) - - - @patch('torchTextClassifiers.torchTextClassifiers.check_X') - def test_build_tokenizer(self, mock_check_X, fasttext_config, sample_text_data): - """Test build_tokenizer method.""" - mock_check_X.return_value = (sample_text_data, None, True) - - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier.prepare_text_features = Mock() - - classifier.build_tokenizer(sample_text_data) - - classifier.classifier.prepare_text_features.assert_called_once_with(sample_text_data) - - @patch('torchTextClassifiers.torchTextClassifiers.check_X') - @patch('torchTextClassifiers.torchTextClassifiers.check_Y') - def test_build_method_with_labels(self, mock_check_Y, mock_check_X, fasttext_config, - sample_text_data, sample_labels): - """Test build method with training labels.""" - mock_check_X.return_value = (sample_text_data, None, True) - mock_check_Y.return_value = 
sample_labels - - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier.prepare_text_features = Mock() - classifier.classifier._build_pytorch_model = Mock() - classifier.classifier._check_and_init_lightning = Mock() - - classifier.build(sample_text_data, sample_labels) - - # Verify methods were called - classifier.classifier.prepare_text_features.assert_called_once() - classifier.classifier._build_pytorch_model.assert_called_once() - classifier.classifier._check_and_init_lightning.assert_called_once() - - # Verify num_classes was updated - assert classifier.config.num_classes == len(np.unique(sample_labels)) - - @patch('torchTextClassifiers.torchTextClassifiers.check_X') - def test_build_method_without_labels(self, mock_check_X, sample_text_data): - """Test build method without training labels.""" - mock_check_X.return_value = (sample_text_data, None, True) - - # Config with pre-set num_classes - config = FastTextConfig( - embedding_dim=10, sparse=False, num_tokens=1000, - min_count=1, min_n=3, max_n=6, len_word_ngrams=2, - num_classes=3 # Pre-set - ) - - wrapper = FastTextWrapper(config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier.prepare_text_features = Mock() - classifier.classifier._build_pytorch_model = Mock() - classifier.classifier._check_and_init_lightning = Mock() - - classifier.build(sample_text_data, y_train=None) - - # Should not raise error since num_classes is pre-set - assert classifier.config.num_classes == 3 - - @patch('torchTextClassifiers.torchTextClassifiers.check_X') - def test_build_method_no_labels_no_num_classes(self, mock_check_X, fasttext_config, sample_text_data): - """Test build method fails when no labels and no num_classes.""" - mock_check_X.return_value = (sample_text_data, None, True) - - # Config without num_classes - fasttext_config.num_classes = None - - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - - with pytest.raises(ValueError, match="Either num_classes must be provided"): - classifier.build(sample_text_data, y_train=None) - - @patch('torchTextClassifiers.torchTextClassifiers.check_X') - @patch('torchTextClassifiers.torchTextClassifiers.check_Y') - def test_build_invalid_labels_range(self, mock_check_Y, mock_check_X, fasttext_config, - sample_text_data): - """Test build method with invalid label range.""" - mock_check_X.return_value = (sample_text_data, None, True) - # Labels with values that don't start from 0 or have gaps (invalid) - invalid_labels = np.array([0, 1, 5]) # Max value 5 but only 3 unique values, so num_classes=3 but max=5 - mock_check_Y.return_value = invalid_labels - - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - - with pytest.raises(ValueError, match="y_train must contain values between 0 and num_classes-1"): - classifier.build(sample_text_data, invalid_labels) - - @patch('torchTextClassifiers.torchTextClassifiers.check_X') - @patch('torchTextClassifiers.torchTextClassifiers.check_Y') - @patch('torch.cuda.is_available') - @patch('pytorch_lightning.Trainer') - def test_train_method_basic(self, mock_trainer_class, mock_cuda, mock_check_Y, mock_check_X, - fasttext_config, sample_text_data, sample_labels, mock_dataset, mock_dataloader): - """Test basic train method functionality.""" - # Setup mocks - mock_check_X.return_value = (sample_text_data, None, True) - mock_check_Y.return_value = sample_labels - mock_cuda.return_value = True - - mock_trainer = Mock() - 
mock_trainer.checkpoint_callback.best_model_path = "/fake/path" - mock_trainer_class.return_value = mock_trainer - - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - - # Mock wrapper methods - classifier.classifier.create_dataset = Mock(return_value=mock_dataset) - classifier.classifier.create_dataloader = Mock(return_value=mock_dataloader) - classifier.classifier.load_best_model = Mock() - classifier.classifier.tokenizer = Mock() # Pretend it's built - classifier.classifier.pytorch_model = Mock() - classifier.classifier.pytorch_model.to = Mock(return_value=classifier.classifier.pytorch_model) - classifier.classifier.lightning_module = Mock() - - # Call train - classifier.train( - X_train=sample_text_data, - y_train=sample_labels, - X_val=sample_text_data[:3], - y_val=sample_labels[:3], - num_epochs=1, - batch_size=2 - ) - - # Verify dataset creation - assert classifier.classifier.create_dataset.call_count == 2 # train + val - assert classifier.classifier.create_dataloader.call_count == 2 # train + val - - # Verify trainer was called - mock_trainer.fit.assert_called_once() - classifier.classifier.load_best_model.assert_called_once() - - def test_predict_method(self, fasttext_config, sample_text_data): - """Test predict method.""" - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier.predict = Mock(return_value=np.array([1, 0, 1])) - - result = classifier.predict(sample_text_data) - - classifier.classifier.predict.assert_called_once_with(sample_text_data) - np.testing.assert_array_equal(result, np.array([1, 0, 1])) - - def test_validate_method(self, fasttext_config, sample_text_data, sample_labels): - """Test validate method.""" - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier.validate = Mock(return_value=0.85) - - result = classifier.validate(sample_text_data, sample_labels) - - classifier.classifier.validate.assert_called_once_with(sample_text_data, sample_labels) - assert result == 0.85 - - def test_predict_and_explain_method(self, fasttext_config, sample_text_data): - """Test predict_and_explain method.""" - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - expected_predictions = np.array([1, 0, 1]) - expected_explanations = np.array([0.8, 0.2, 0.9]) - classifier.classifier.predict_and_explain = Mock( - return_value=(expected_predictions, expected_explanations) - ) - - predictions, explanations = classifier.predict_and_explain(sample_text_data) - - classifier.classifier.predict_and_explain.assert_called_once_with(sample_text_data) - np.testing.assert_array_equal(predictions, expected_predictions) - np.testing.assert_array_equal(explanations, expected_explanations) - - def test_predict_and_explain_not_supported(self, fasttext_config, sample_text_data): - """Test predict_and_explain when not supported by wrapper.""" - - # Create a mock wrapper class that doesn't have predict_and_explain - class MockWrapperWithoutExplain: - pass - - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier = MockWrapperWithoutExplain() - - with pytest.raises(NotImplementedError, match="Explanation not supported"): - classifier.predict_and_explain(sample_text_data) - - def test_to_json_method(self, fasttext_config): - """Test to_json serialization method.""" - wrapper = FastTextWrapper(fasttext_config) - classifier = torchTextClassifiers(wrapper) - - 
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: - temp_path = f.name - - try: - classifier.to_json(temp_path) - - # Verify file was created and has correct content - assert os.path.exists(temp_path) - - with open(temp_path, 'r') as f: - data = json.load(f) - - assert 'wrapper_class_info' in data - assert 'config' in data - assert data['config']['embedding_dim'] == fasttext_config.embedding_dim - - finally: - if os.path.exists(temp_path): - os.unlink(temp_path) - - def test_from_json_method(self, fasttext_config): - """Test from_json deserialization method.""" - # First create a JSON file - wrapper = FastTextWrapper(fasttext_config) - original_classifier = torchTextClassifiers(wrapper) - - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: - temp_path = f.name - - try: - original_classifier.to_json(temp_path) - - # Load from JSON - loaded_classifier = torchTextClassifiers.from_json(temp_path) - - assert isinstance(loaded_classifier.classifier, FastTextWrapper) - assert loaded_classifier.config.embedding_dim == fasttext_config.embedding_dim - assert loaded_classifier.config.sparse == fasttext_config.sparse - assert loaded_classifier.config.num_tokens == fasttext_config.num_tokens - - finally: - if os.path.exists(temp_path): - os.unlink(temp_path) - - def test_from_json_missing_wrapper_info(self): - """Test from_json with missing wrapper class info.""" - # Create a JSON without wrapper_class_info - fake_data = { - "config": {"embedding_dim": 50, "sparse": False, "num_tokens": 1000, - "min_count": 1, "min_n": 3, "max_n": 6, "len_word_ngrams": 2} - } - - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: - json.dump(fake_data, f) - temp_path = f.name - - try: - with pytest.raises(ValueError, match="No wrapper_class_info found"): - torchTextClassifiers.from_json(temp_path) - finally: - if os.path.exists(temp_path): - os.unlink(temp_path) - - def test_from_json_with_explicit_wrapper_class(self, fasttext_config): - """Test from_json with explicitly provided wrapper class.""" - # First create a JSON file - wrapper = FastTextWrapper(fasttext_config) - original_classifier = torchTextClassifiers(wrapper) - - with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: - temp_path = f.name - - try: - original_classifier.to_json(temp_path) - - # Load from JSON with explicit wrapper class - loaded_classifier = torchTextClassifiers.from_json(temp_path, FastTextWrapper) - - assert isinstance(loaded_classifier.classifier, FastTextWrapper) - assert loaded_classifier.config.embedding_dim == fasttext_config.embedding_dim - - finally: - if os.path.exists(temp_path): - os.unlink(temp_path) \ No newline at end of file + + assert isinstance(ttc.pytorch_model, TextClassificationModel) + assert isinstance(ttc.classification_head, ClassificationHead) diff --git a/torchTextClassifiers/__init__.py b/torchTextClassifiers/__init__.py index 8e61d86..b8ce72b 100644 --- a/torchTextClassifiers/__init__.py +++ b/torchTextClassifiers/__init__.py @@ -11,58 +11,22 @@ - Extensible architecture for adding new classifier types - Support for both text-only and mixed text/categorical features -Quick Start: - >>> from torchTextClassifiers import create_fasttext - >>> import numpy as np - >>> - >>> # Create classifier - >>> classifier = create_fasttext( - ... embedding_dim=100, - ... sparse=False, - ... num_tokens=10000, - ... min_count=2, - ... min_n=3, - ... max_n=6, - ... len_word_ngrams=2, - ... num_classes=2 - ... 
)
-    >>>
-    >>> # Prepare data
-    >>> X_train = np.array(["positive text", "negative text"])
-    >>> y_train = np.array([1, 0])
-    >>> X_val = np.array(["validation text"])
-    >>> y_val = np.array([1])
-    >>>
-    >>> # Build and train
-    >>> classifier.build(X_train, y_train)
-    >>> classifier.train(X_train, y_train, X_val, y_val, num_epochs=10, batch_size=32)
-    >>>
-    >>> # Predict
-    >>> predictions = classifier.predict(np.array(["new text sample"]))
 """
 
-from .torchTextClassifiers import torchTextClassifiers
-
-# Convenience imports for FastText
-try:
-    from .classifiers.fasttext.core import FastTextFactory
-
-    # Expose FastText convenience methods at package level for easy access
-    create_fasttext = FastTextFactory.create_fasttext
-    build_fasttext_from_tokenizer = FastTextFactory.build_from_tokenizer
-
-except ImportError:
-    # FastText module not available - define placeholder functions
-    def create_fasttext(*args, **kwargs):
-        raise ImportError("FastText module not available")
-
-    def build_fasttext_from_tokenizer(*args, **kwargs):
-        raise ImportError("FastText module not available")
+from .torchTextClassifiers import (
+    ModelConfig as ModelConfig,
+)
+from .torchTextClassifiers import (
+    TrainingConfig as TrainingConfig,
+)
+from .torchTextClassifiers import (
+    torchTextClassifiers as torchTextClassifiers,
+)
 
 __all__ = [
     "torchTextClassifiers",
-    "create_fasttext",
-    "build_fasttext_from_tokenizer",
+    "ModelConfig",
+    "TrainingConfig",
 ]
 
-__version__ = "1.0.0"
\ No newline at end of file
+__version__ = "1.0.0"
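The PR deletes the package-level Quick Start above without a direct replacement in the docstring. For orientation, here is what the equivalent quick start looks like against the new API, reconstructed from `tests/test_pipeline.py` in this diff — a sketch, not library documentation: the toy data and hyperparameters are illustrative, and omitting `attention_config` from `ModelConfig` is an assumption (the test always passes one).

```python
import numpy as np

from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers
from torchTextClassifiers.tokenizers import WordPieceTokenizer

# Toy corpus: one text column plus one categorical column.
texts = ["great product", "terrible service", "love it", "awful quality"]
categorical = np.array([[0], [1], [0], [1]])
labels = np.array([1, 0, 1, 0])
X = np.column_stack([texts, categorical])  # mixed input, as in test_pipeline.py

# Train a tokenizer on the raw texts.
tokenizer = WordPieceTokenizer(vocab_size=1000, output_dim=32)
tokenizer.train(texts)

# Configure the model; attention_config is omitted here (assumed optional).
model_config = ModelConfig(
    embedding_dim=64,
    categorical_vocabulary_sizes=[2],
    categorical_embedding_dims=[4],
    num_classes=2,
)

ttc = torchTextClassifiers(tokenizer=tokenizer, model_config=model_config)
ttc.train(
    X_train=X,
    y_train=labels,
    X_val=X,
    y_val=labels,
    training_config=TrainingConfig(lr=1e-3, batch_size=2, num_epochs=1),
)
predictions = ttc.predict(X, top_k=1)
```

diff --git a/torchTextClassifiers/classifiers/base.py b/torchTextClassifiers/classifiers/base.py
deleted file mode 100644
index 193fa0a..0000000
--- a/torchTextClassifiers/classifiers/base.py
+++ /dev/null
@@ -1,83 +0,0 @@
-from typing import Optional, Union, Type, List, Dict, Any
-from dataclasses import dataclass, field, asdict
-from abc import ABC, abstractmethod
-import numpy as np
-
-class BaseClassifierConfig(ABC):
-    """Abstract base class for classifier configurations."""
-
-    @abstractmethod
-    def to_dict(self) -> Dict[str, Any]:
-        """Convert configuration to dictionary."""
-        pass
-
-    @classmethod
-    @abstractmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "BaseClassifierConfig":
-        """Create configuration from dictionary."""
-        pass
-
-class BaseClassifierWrapper(ABC):
-    """Abstract base class for classifier wrappers.
-
-    Each classifier wrapper is responsible for its own text processing approach.
-    Some may use tokenizers, others may use different preprocessing methods.
-    """
-
-    def __init__(self, config: BaseClassifierConfig):
-        self.config = config
-        self.pytorch_model = None
-        self.lightning_module = None
-        self.trained: bool = False
-        self.device = None
-        # Remove tokenizer from base class - it's now wrapper-specific
-
-    @abstractmethod
-    def prepare_text_features(self, training_text: np.ndarray) -> None:
-        """Prepare text features for the classifier.
-
-        This could involve tokenization, vectorization, or other preprocessing.
-        Each classifier wrapper implements this according to its needs.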
- """ - pass - - @abstractmethod - def _build_pytorch_model(self) -> None: - """Build the PyTorch model.""" - pass - - @abstractmethod - def _check_and_init_lightning(self, **kwargs) -> None: - """Initialize Lightning module.""" - pass - - @abstractmethod - def predict(self, X: np.ndarray, **kwargs) -> np.ndarray: - """Make predictions.""" - pass - - @abstractmethod - def validate(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - """Validate the model.""" - pass - - @abstractmethod - def create_dataset(self, texts: np.ndarray, labels: np.ndarray, categorical_variables: Optional[np.ndarray] = None): - """Create dataset for training/validation.""" - pass - - @abstractmethod - def create_dataloader(self, dataset, batch_size: int, num_workers: int = 0, shuffle: bool = True): - """Create dataloader from dataset.""" - pass - - @abstractmethod - def load_best_model(self, checkpoint_path: str) -> None: - """Load best model from checkpoint.""" - pass - - @classmethod - @abstractmethod - def get_config_class(cls) -> Type[BaseClassifierConfig]: - """Return the configuration class for this wrapper.""" - pass \ No newline at end of file diff --git a/torchTextClassifiers/classifiers/fasttext/__init__.py b/torchTextClassifiers/classifiers/fasttext/__init__.py deleted file mode 100644 index c9da238..0000000 --- a/torchTextClassifiers/classifiers/fasttext/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -"""FastText classifier package. - -Provides FastText text classification with PyTorch Lightning integration. -This folder contains 4 main files: -- core.py: Configuration, losses, and factory methods -- tokenizer.py: NGramTokenizer implementation -- model.py: PyTorch model, Lightning module, and dataset -- wrapper.py: High-level wrapper interface -""" - -from .core import FastTextConfig, OneVsAllLoss, FastTextFactory -from .tokenizer import NGramTokenizer -from .model import FastTextModel, FastTextModule, FastTextModelDataset -from .wrapper import FastTextWrapper - -__all__ = [ - "FastTextConfig", - "OneVsAllLoss", - "FastTextFactory", - "NGramTokenizer", - "FastTextModel", - "FastTextModule", - "FastTextModelDataset", - "FastTextWrapper", -] \ No newline at end of file diff --git a/torchTextClassifiers/classifiers/fasttext/core.py b/torchTextClassifiers/classifiers/fasttext/core.py deleted file mode 100644 index c9c24b0..0000000 --- a/torchTextClassifiers/classifiers/fasttext/core.py +++ /dev/null @@ -1,269 +0,0 @@ -"""FastText classifier core components. - -This module contains the core components for FastText classification: -- Configuration dataclass -- Loss functions -- Factory methods for creating classifiers - -Consolidates what was previously in config.py, losses.py, and factory.py. 
-""" - -from dataclasses import dataclass, field, asdict -from abc import ABC, abstractmethod -from ..base import BaseClassifierConfig -from typing import Optional, List, TYPE_CHECKING, Union, Dict, Any -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn - -if TYPE_CHECKING: - from ...torchTextClassifiers import torchTextClassifiers - - -# ============================================================================ -# Configuration -# ============================================================================ - -@dataclass -class FastTextConfig(BaseClassifierConfig): - """Configuration for FastText classifier.""" - # Embedding matrix - embedding_dim: int - sparse: bool - - # Tokenizer-related - num_tokens: int - min_count: int - min_n: int - max_n: int - len_word_ngrams: int - - # Optional parameters - num_classes: Optional[int] = None - num_rows: Optional[int] = None - - # Categorical variables - categorical_vocabulary_sizes: Optional[List[int]] = None - categorical_embedding_dims: Optional[Union[List[int], int]] = None - num_categorical_features: Optional[int] = None - - # Model-specific parameters - direct_bagging: Optional[bool] = True - - # Training parameters - learning_rate: float = 4e-3 - - def to_dict(self) -> Dict[str, Any]: - return asdict(self) - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "FastTextConfig": - return cls(**data) - - -# ============================================================================ -# Loss Functions -# ============================================================================ - -class OneVsAllLoss(nn.Module): - def __init__(self): - super(OneVsAllLoss, self).__init__() - - def forward(self, logits, targets): - """ - Compute One-vs-All loss - - Args: - logits: Tensor of shape (batch_size, num_classes) containing classification scores - targets: Tensor of shape (batch_size) containing true class indices - - Returns: - loss: Mean loss value across the batch - """ - - num_classes = logits.size(1) - - # Convert targets to one-hot encoding - targets_one_hot = F.one_hot(targets, num_classes=num_classes).float() - - # For each sample, treat the true class as positive and all others as negative - # Using binary cross entropy for each class - loss = F.binary_cross_entropy_with_logits( - logits, # Raw logits - targets_one_hot, # Target probabilities - reduction="none", # Don't reduce yet to allow for custom weighting if needed - ) - - # Sum losses across all classes for each sample, then take mean across batch - return loss.sum(dim=1).mean() - - -# ============================================================================ -# Factory Methods -# ============================================================================ - -class FastTextFactory: - """Factory class for creating FastText classifiers with convenience methods. - - This factory provides static methods for creating FastText classifiers with - common configurations. It handles the complexities of configuration creation - and classifier initialization, offering a simplified API for users. - - All methods return fully initialized torchTextClassifiers instances that are - ready for building and training. - """ - - @staticmethod - def create_fasttext( - embedding_dim: int, - sparse: bool, - num_tokens: int, - min_count: int, - min_n: int, - max_n: int, - len_word_ngrams: int, - **kwargs - ) -> "torchTextClassifiers": - """Create a FastText classifier with the specified configuration. 
- - This is the primary method for creating FastText classifiers. It creates - a configuration object with the provided parameters and initializes a - complete classifier instance. - - Args: - embedding_dim: Dimension of word embeddings - sparse: Whether to use sparse embeddings - num_tokens: Maximum number of tokens in vocabulary - min_count: Minimum count for tokens to be included in vocabulary - min_n: Minimum length of character n-grams - max_n: Maximum length of character n-grams - len_word_ngrams: Length of word n-grams to use - **kwargs: Additional configuration parameters (e.g., num_classes, - categorical_vocabulary_sizes, etc.) - - Returns: - torchTextClassifiers: Initialized FastText classifier instance - - Example: - >>> from torchTextClassifiers.classifiers.fasttext.core import FastTextFactory - >>> classifier = FastTextFactory.create_fasttext( - ... embedding_dim=100, - ... sparse=False, - ... num_tokens=10000, - ... min_count=2, - ... min_n=3, - ... max_n=6, - ... len_word_ngrams=2, - ... num_classes=3 - ... ) - """ - from ...torchTextClassifiers import torchTextClassifiers - from .wrapper import FastTextWrapper - - config = FastTextConfig( - embedding_dim=embedding_dim, - sparse=sparse, - num_tokens=num_tokens, - min_count=min_count, - min_n=min_n, - max_n=max_n, - len_word_ngrams=len_word_ngrams, - **kwargs - ) - wrapper = FastTextWrapper(config) - return torchTextClassifiers(wrapper) - - @staticmethod - def build_from_tokenizer( - tokenizer, # NGramTokenizer - embedding_dim: int, - num_classes: Optional[int], - categorical_vocabulary_sizes: Optional[List[int]] = None, - sparse: bool = False, - **kwargs - ) -> "torchTextClassifiers": - """Create FastText classifier from an existing trained tokenizer. - - This method is useful when you have a pre-trained tokenizer and want to - create a classifier that uses the same vocabulary and tokenization scheme. - The resulting classifier will have its tokenizer and model architecture - pre-built. - - Args: - tokenizer: Pre-trained NGramTokenizer instance - embedding_dim: Dimension of word embeddings - num_classes: Number of output classes - categorical_vocabulary_sizes: Sizes of categorical feature vocabularies - sparse: Whether to use sparse embeddings - **kwargs: Additional configuration parameters - - Returns: - torchTextClassifiers: Classifier with pre-built tokenizer and model - - Raises: - ValueError: If the tokenizer is missing required attributes - - Example: - >>> # Assume you have a pre-trained tokenizer - >>> classifier = FastTextFactory.build_from_tokenizer( - ... tokenizer=my_tokenizer, - ... embedding_dim=100, - ... num_classes=2, - ... sparse=False - ... ) - >>> # The classifier is ready for training without building - >>> classifier.train(X_train, y_train, X_val, y_val, ...) 
- """ - from ...torchTextClassifiers import torchTextClassifiers - from .wrapper import FastTextWrapper - - # Ensure the tokenizer has required attributes - required_attrs = ["min_count", "min_n", "max_n", "num_tokens", "word_ngrams"] - if not all(hasattr(tokenizer, attr) for attr in required_attrs): - missing_attrs = [attr for attr in required_attrs if not hasattr(tokenizer, attr)] - raise ValueError(f"Missing attributes in tokenizer: {missing_attrs}") - - config = FastTextConfig( - num_tokens=tokenizer.num_tokens, - embedding_dim=embedding_dim, - min_count=tokenizer.min_count, - min_n=tokenizer.min_n, - max_n=tokenizer.max_n, - len_word_ngrams=tokenizer.word_ngrams, - sparse=sparse, - num_classes=num_classes, - categorical_vocabulary_sizes=categorical_vocabulary_sizes, - **kwargs - ) - - wrapper = FastTextWrapper(config) - classifier = torchTextClassifiers(wrapper) - classifier.classifier.tokenizer = tokenizer - classifier.classifier._build_pytorch_model() - - return classifier - - @staticmethod - def from_dict(config_dict: dict) -> FastTextConfig: - """Create FastText configuration from dictionary. - - This method is used internally by the configuration factory system - to recreate FastText configurations from serialized data. - - Args: - config_dict: Dictionary containing configuration parameters - - Returns: - FastTextConfig: Reconstructed configuration object - - Example: - >>> config_dict = { - ... 'embedding_dim': 100, - ... 'num_tokens': 5000, - ... 'min_count': 1, - ... # ... other parameters - ... } - >>> config = FastTextFactory.from_dict(config_dict) - """ - return FastTextConfig.from_dict(config_dict) \ No newline at end of file diff --git a/torchTextClassifiers/classifiers/fasttext/model.py b/torchTextClassifiers/classifiers/fasttext/model.py deleted file mode 100644 index ac8cc7f..0000000 --- a/torchTextClassifiers/classifiers/fasttext/model.py +++ /dev/null @@ -1,752 +0,0 @@ -"""FastText model components. - -This module contains the PyTorch model, Lightning module, and dataset classes -for FastText classification. Consolidates what was previously in pytorch_model.py, -lightning_module.py, and dataset.py. -""" - -import os -import logging -from typing import List, Union -import torch -import pytorch_lightning as pl -from torch import nn -from torchmetrics import Accuracy - -try: - from captum.attr import LayerIntegratedGradients - HAS_CAPTUM = True -except ImportError: - HAS_CAPTUM = False - -from ...utilities.utils import ( - compute_preprocessed_word_score, - compute_word_score, - explain_continuous, -) -from ...utilities.checkers import validate_categorical_inputs - -logger = logging.getLogger(__name__) - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - handlers=[logging.StreamHandler()], -) - - -# ============================================================================ -# PyTorch Model -# ============================================================================ - -class FastTextModel(nn.Module): - """FastText Pytorch Model.""" - - def __init__( - self, - embedding_dim: int, - num_classes: int, - tokenizer=None, - num_rows: int = None, - categorical_vocabulary_sizes: List[int] = None, - categorical_embedding_dims: Union[List[int], int] = None, - padding_idx: int = 0, - sparse: bool = True, - direct_bagging: bool = False, - ): - """ - Constructor for the FastTextModel class. - - Args: - embedding_dim (int): Dimension of the text embedding space. 
-            num_rows (int): Number of rows in the embedding matrix.
-            num_classes (int): Number of classes.
-            categorical_vocabulary_sizes (List[int]): List of the number of
-                modalities for additional categorical features.
-            padding_idx (int, optional): Padding index for the text
-                descriptions. Defaults to 0.
-            sparse (bool): Indicates if Embedding layer is sparse.
-            direct_bagging (bool): Use EmbeddingBag instead of Embedding for the text embedding.
-        """
-        super(FastTextModel, self).__init__()
-
-        if isinstance(categorical_embedding_dims, int):
-            self.average_cat_embed = True  # if the provided categorical embedding dim is an int, average the categorical embeddings before concatenating to the sentence embedding
-        else:
-            self.average_cat_embed = False
-
-        categorical_vocabulary_sizes, categorical_embedding_dims, num_categorical_features = (
-            validate_categorical_inputs(
-                categorical_vocabulary_sizes,
-                categorical_embedding_dims,
-                num_categorical_features=None,
-            )
-        )
-
-        assert isinstance(categorical_embedding_dims, list) or categorical_embedding_dims is None, (
-            "categorical_embedding_dims must be a list of int at this stage"
-        )
-
-        if categorical_embedding_dims is None:
-            self.average_cat_embed = False
-
-        if tokenizer is None:
-            if num_rows is None:
-                raise ValueError(
-                    "Either tokenizer or num_rows must be provided (number of rows in the embedding matrix)."
-                )
-        else:
-            if num_rows is not None:
-                if num_rows != tokenizer.num_tokens:
-                    logger.warning(
-                        "num_rows is different from the number of tokens in the tokenizer. Using provided num_rows."
-                    )
-
-        self.num_rows = num_rows
-
-        self.num_classes = num_classes
-        self.padding_idx = padding_idx
-        self.tokenizer = tokenizer
-        self.embedding_dim = embedding_dim
-        self.direct_bagging = direct_bagging
-        self.sparse = sparse
-
-        self.categorical_embedding_dims = categorical_embedding_dims
-
-        self.embeddings = (
-            nn.Embedding(
-                embedding_dim=embedding_dim,
-                num_embeddings=num_rows,
-                padding_idx=padding_idx,
-                sparse=sparse,
-            )
-            if not direct_bagging
-            else nn.EmbeddingBag(
-                embedding_dim=embedding_dim,
-                num_embeddings=num_rows,
-                padding_idx=padding_idx,
-                sparse=sparse,
-                mode="mean",
-            )
-        )
-
-        self.categorical_embedding_layers = {}
-
-        # Entry dim for the last layer:
-        # 1. embedding_dim if no categorical variables, or if the categorical embeddings are summed into the sentence embedding
-        # 2. embedding_dim + cat_embedding_dim if the categorical embeddings are averaged before being concatenated to the sentence embedding (categorical_embedding_dims is an int)
-        # 3. embedding_dim + sum(categorical_embedding_dims) if each categorical embedding is concatenated individually to the sentence embedding (no averaging, categorical_embedding_dims is a list)
-        dim_in_last_layer = embedding_dim
-        if self.average_cat_embed:
-            dim_in_last_layer += categorical_embedding_dims[0]
-
-        if categorical_vocabulary_sizes is not None:
-            self.no_cat_var = False
-            for var_idx, num_rows in enumerate(categorical_vocabulary_sizes):
-                if categorical_embedding_dims is not None:
-                    emb = nn.Embedding(
-                        embedding_dim=categorical_embedding_dims[var_idx], num_embeddings=num_rows
-                    )  # concatenate to sentence embedding
-                    if not self.average_cat_embed:
-                        dim_in_last_layer += categorical_embedding_dims[var_idx]
-                else:
-                    emb = nn.Embedding(
-                        embedding_dim=embedding_dim, num_embeddings=num_rows
-                    )  # sum to sentence embedding
-                self.categorical_embedding_layers[var_idx] = emb
-                setattr(self, "emb_{}".format(var_idx), emb)
-        else:
-            self.no_cat_var = True
-
-        self.fc = nn.Linear(dim_in_last_layer, num_classes)
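The final layer's input dimension therefore depends on how categorical embeddings are fused with the sentence embedding; the new `CategoricalVariableNet` later in this diff keeps the same three modes (see `CategoricalForwardType`). A standalone sketch of the arithmetic, with made-up sizes, not part of the diff:

```python
# Hypothetical dimensions, for illustration only.
embedding_dim = 100                # sentence embedding size
cat_embedding_dims = [10, 20, 30]  # one embedding size per categorical variable

# 1. Sum categorical embeddings into the sentence embedding
#    (each categorical embedding must then share the sentence dimension).
dim_sum = embedding_dim  # 100

# 2. Average the categorical embeddings (a single int was given, so all
#    dims are equal), then concatenate the average to the sentence embedding.
dim_avg_concat = embedding_dim + cat_embedding_dims[0]  # 110

# 3. Concatenate every categorical embedding individually.
dim_concat_all = embedding_dim + sum(cat_embedding_dims)  # 160
```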
-    def forward(self, encoded_text: torch.Tensor, additional_inputs: torch.Tensor) -> torch.Tensor:
-        """
-        Memory-efficient forward pass implementation.
-
-        Args:
-            encoded_text (torch.Tensor[Long]), shape (batch_size, seq_len): Tokenized + padded text
-            additional_inputs (torch.Tensor[Long]): Additional categorical features, (batch_size, num_categorical_features)
-
-        Returns:
-            torch.Tensor: Model output scores for each class
-        """
-        batch_size = encoded_text.size(0)
-
-        # Ensure correct dtype once
-        if encoded_text.dtype != torch.long:
-            encoded_text = encoded_text.to(torch.long)
-
-        # Compute text embeddings
-        if self.direct_bagging:
-            x_text = self.embeddings(encoded_text)  # (batch_size, embedding_dim)
-        else:
-            # Compute embeddings and average them in a memory-efficient way
-            x_text = self.embeddings(encoded_text)  # (batch_size, seq_len, embedding_dim)
-            # Calculate non-zero tokens mask once
-            non_zero_mask = (x_text.sum(-1) != 0).float()  # (batch_size, seq_len)
-            token_counts = non_zero_mask.sum(-1, keepdim=True)  # (batch_size, 1)
-
-            # Sum and average in place
-            x_text = (x_text * non_zero_mask.unsqueeze(-1)).sum(
-                dim=1
-            )  # (batch_size, embedding_dim)
-            x_text = torch.div(x_text, token_counts.clamp(min=1.0))
-            x_text = torch.nan_to_num(x_text, 0.0)
-
-        # Handle categorical variables efficiently
-        if not self.no_cat_var and additional_inputs.numel() > 0:
-            cat_embeds = []
-            # Process categorical embeddings in batch
-            for i, (_, embed_layer) in enumerate(self.categorical_embedding_layers.items()):
-                cat_input = additional_inputs[:, i].long()
-
-                # Check if categorical values are within valid range and clamp if needed
-                vocab_size = embed_layer.num_embeddings
-                max_val = cat_input.max().item()
-                min_val = cat_input.min().item()
-
-                if max_val >= vocab_size or min_val < 0:
-                    logger.warning(f"Categorical feature {i}: values range [{min_val}, {max_val}] exceed vocabulary size {vocab_size}. 
Clamping to valid range [0, {vocab_size - 1}]") - # Clamp values to valid range - cat_input = torch.clamp(cat_input, 0, vocab_size - 1) - - cat_embed = embed_layer(cat_input) - if cat_embed.dim() > 2: - cat_embed = cat_embed.squeeze(1) - cat_embeds.append(cat_embed) - - if cat_embeds: # If we have categorical embeddings - if self.categorical_embedding_dims is not None: - if self.average_cat_embed: - # Stack and average in one operation - x_cat = torch.stack(cat_embeds, dim=0).mean(dim=0) - x_combined = torch.cat([x_text, x_cat], dim=1) - else: - # Optimize concatenation - x_combined = torch.cat([x_text] + cat_embeds, dim=1) - else: - # Sum embeddings efficiently - x_combined = x_text + torch.stack(cat_embeds, dim=0).sum(dim=0) - else: - x_combined = x_text - else: - x_combined = x_text - - # Final linear layer - return self.fc(x_combined) - - def predict( - self, - text: List[str], - categorical_variables: List[List[int]], - top_k=1, - explain=False, - preprocess=True, - ): - """ - Args: - text (List[str]): A list of text observations. - params (Optional[Dict[str, Any]]): Additional parameters to - pass to the model for inference. - top_k (int): for each sentence, return the top_k most likely predictions (default: 1) - explain (bool): launch gradient integration to have an explanation of the prediction (default: False) - preprocess (bool): If True, preprocess text. Needs unidecode library. - - Returns: - if explain is False: - predictions (torch.Tensor, shape (len(text), top_k)): A tensor containing the top_k most likely codes to the query. - confidence (torch.Tensor, shape (len(text), top_k)): A tensor array containing the corresponding confidence scores. - if explain is True: - predictions (torch.Tensor, shape (len(text), top_k)): Containing the top_k most likely codes to the query. - confidence (torch.Tensor, shape (len(text), top_k)): Corresponding confidence scores. - all_attributions (torch.Tensor, shape (len(text), top_k, seq_len)): A tensor containing the attributions for each token in the text. - x (torch.Tensor): A tensor containing the token indices of the text. - id_to_token_dicts (List[Dict[int, str]]): A list of dictionaries mapping token indices to tokens (one for each sentence). - token_to_id_dicts (List[Dict[str, int]]): A list of dictionaries mapping tokens to token indices: the reverse of those in id_to_token_dicts. - text (list[str]): A plist containing the preprocessed text (one line for each sentence). - """ - - flag_change_embed = False - if explain: - if not HAS_CAPTUM: - raise ImportError( - "Captum is not installed and is required for explainability. Run 'pip install torchFastText[explainability]'." 
- ) - if self.direct_bagging: - # Get back the classical embedding layer for explainability - new_embed_layer = nn.Embedding( - embedding_dim=self.embedding_dim, - num_embeddings=self.num_rows, - padding_idx=self.padding_idx, - sparse=self.sparse, - ) - new_embed_layer.load_state_dict( - self.embeddings.state_dict() - ) # No issues, as exactly the same parameters - self.embeddings = new_embed_layer - self.direct_bagging = ( - False # To inform the forward pass that we are not using EmbeddingBag anymore - ) - flag_change_embed = True - - lig = LayerIntegratedGradients( - self, self.embeddings - ) # initialize a Captum layer gradient integrator - - self.eval() - batch_size = len(text) - - indices_batch, id_to_token_dicts, token_to_id_dicts = self.tokenizer.tokenize( - text, text_tokens=False, preprocess=preprocess - ) - - padding_index = ( - self.tokenizer.get_buckets() + self.tokenizer.get_nwords() - ) # padding index, the integer value of the padding token - - padded_batch = torch.nn.utils.rnn.pad_sequence( - indices_batch, - batch_first=True, - padding_value=padding_index, - ) # (batch_size, seq_len) - Tokenized (int) + padded text - - x = padded_batch - - if not self.no_cat_var: - other_features = [] - # Transpose categorical_variables to iterate over features instead of samples - categorical_variables_transposed = categorical_variables.T - for i, categorical_variable in enumerate(categorical_variables_transposed): - other_features.append( - torch.tensor(categorical_variable).reshape(batch_size, -1).to(torch.int64) - ) - - other_features = torch.stack(other_features).reshape(batch_size, -1).long() - else: - other_features = torch.empty(batch_size) - - pred = self( - x, other_features - ) # forward pass, contains the prediction scores (len(text), num_classes) - label_scores = pred.detach().cpu() - label_scores_topk = torch.topk(label_scores, k=top_k, dim=1) - - predictions = label_scores_topk.indices # get the top_k most likely predictions - confidence = torch.round(label_scores_topk.values, decimals=2) # and their scores - - if explain: - assert not self.direct_bagging, "Direct bagging should be False for explainability" - all_attributions = [] - for k in range(top_k): - attributions = lig.attribute( - (x, other_features), target=torch.Tensor(predictions[:, k]).long() - ) # (batch_size, seq_len) - attributions = attributions.sum(dim=-1) - all_attributions.append(attributions.detach().cpu()) - - all_attributions = torch.stack(all_attributions, dim=1) # (batch_size, top_k, seq_len) - - # Get back to initial embedding layer: - # EmbeddingBag -> Embedding -> EmbeddingBag - # or keep Embedding with no change - if flag_change_embed: - new_embed_layer = nn.EmbeddingBag( - embedding_dim=self.embedding_dim, - num_embeddings=self.num_rows, - padding_idx=self.padding_idx, - sparse=self.sparse, - ) - new_embed_layer.load_state_dict( - self.embeddings.state_dict() - ) # No issues, as exactly the same parameters - self.embeddings = new_embed_layer - self.direct_bagging = True - return ( - predictions, - confidence, - all_attributions, - x, - id_to_token_dicts, - token_to_id_dicts, - text, - ) - else: - return predictions, confidence - - def predict_and_explain(self, text, categorical_variables, top_k=1, n=5, cutoff=0.65): - """ - Args: - text (List[str]): A list of sentences. - params (Optional[Dict[str, Any]]): Additional parameters to - pass to the model for inference. 
- top_k (int): for each sentence, return the top_k most likely predictions (default: 1) - n (int): mapping processed to original words: max number of candidate processed words to consider per original word (default: 5) - cutoff (float): mapping processed to original words: minimum similarity score to consider a candidate processed word (default: 0.75) - - Returns: - predictions (torch.Tensor, shape (len(text), top_k)): Containing the top_k most likely codes to the query. - confidence (torch.Tensor, shape (len(text), top_k)): Corresponding confidence scores. - all_scores (List[List[List[float]]]): For each sentence, list of the top_k lists of attributions for each word in the sentence (one for each pred). - """ - - # Step 1: Get the predictions, confidence scores and attributions at token level - ( - pred, - confidence, - all_attr, - tokenized_text, - id_to_token_dicts, - token_to_id_dicts, - processed_text, - ) = self.predict( - text=text, categorical_variables=categorical_variables, top_k=top_k, explain=True - ) - - tokenized_text_tokens = self.tokenizer._tokenized_text_in_tokens( - tokenized_text, id_to_token_dicts - ) - - # Step 2: Map the attributions at token level to the processed words - processed_word_to_score_dicts, processed_word_to_token_idx_dicts = ( - compute_preprocessed_word_score( - processed_text, - tokenized_text_tokens, - all_attr, - id_to_token_dicts, - token_to_id_dicts, - min_n=self.tokenizer.min_n, - padding_index=self.padding_idx, - end_of_string_index=0, - ) - ) - - # Step 3: Map the processed words to the original words - all_scores, orig_to_processed_mappings = compute_word_score( - processed_word_to_score_dicts, text, n=n, cutoff=cutoff - ) - - # Step 2bis: Get the attributions at letter level - all_scores_letters = explain_continuous( - text, - processed_text, - tokenized_text_tokens, - orig_to_processed_mappings, - processed_word_to_token_idx_dicts, - all_attr, - top_k, - ) - - return pred, confidence, all_scores, all_scores_letters - - -# ============================================================================ -# PyTorch Lightning Module -# ============================================================================ - -class FastTextModule(pl.LightningModule): - """Pytorch Lightning Module for FastTextModel.""" - - def __init__( - self, - model: FastTextModel, - loss, - optimizer, - optimizer_params, - scheduler, - scheduler_params, - scheduler_interval="epoch", - **kwargs, - ): - """ - Initialize FastTextModule. - - Args: - model: Model. - loss: Loss - optimizer: Optimizer - optimizer_params: Optimizer parameters. - scheduler: Scheduler. - scheduler_params: Scheduler parameters. - scheduler_interval: Scheduler interval. - """ - super().__init__() - self.save_hyperparameters(ignore=["model", "loss"]) - - self.model = model - self.loss = loss - self.accuracy_fn = Accuracy(task="multiclass", num_classes=self.model.num_classes) - self.optimizer = optimizer - self.optimizer_params = optimizer_params - self.scheduler = scheduler - self.scheduler_params = scheduler_params - self.scheduler_interval = scheduler_interval - - def forward(self, inputs) -> torch.Tensor: - """ - Perform forward-pass. - - Args: - batch (List[torch.LongTensor]): Batch to perform forward-pass on. - - Returns (torch.Tensor): Prediction. - """ - return self.model(inputs[0], inputs[1]) - - def training_step(self, batch, batch_idx: int) -> torch.Tensor: - """ - Training step. - - Args: - batch (List[torch.LongTensor]): Training batch. - batch_idx (int): Batch index. 
- - Returns (torch.Tensor): Loss tensor. - """ - - inputs, targets = batch[:-1], batch[-1] - outputs = self.forward(inputs) - loss = self.loss(outputs, targets) - self.log("train_loss", loss, on_epoch=True, on_step=True, prog_bar=True) - accuracy = self.accuracy_fn(outputs, targets) - self.log("train_accuracy", accuracy, on_epoch=True, on_step=False, prog_bar=True) - - torch.cuda.empty_cache() - - return loss - - def validation_step(self, batch, batch_idx: int): - """ - Validation step. - - Args: - batch (List[torch.LongTensor]): Validation batch. - batch_idx (int): Batch index. - - Returns (torch.Tensor): Loss tensor. - """ - inputs, targets = batch[:-1], batch[-1] - outputs = self.forward(inputs) - loss = self.loss(outputs, targets) - self.log("val_loss", loss, on_epoch=True, on_step=False, prog_bar=True, sync_dist=True) - - accuracy = self.accuracy_fn(outputs, targets) - self.log("val_accuracy", accuracy, on_epoch=True, on_step=False, prog_bar=True) - return loss - - def test_step(self, batch, batch_idx: int): - """ - Test step. - - Args: - batch (List[torch.LongTensor]): Test batch. - batch_idx (int): Batch index. - - Returns (torch.Tensor): Loss tensor. - """ - inputs, targets = batch[:-1], batch[-1] - outputs = self.forward(inputs) - loss = self.loss(outputs, targets) - - accuracy = self.accuracy_fn(outputs, targets) - - return loss, accuracy - - def configure_optimizers(self): - """ - Configure optimizer for Pytorch lighting. - - Returns: Optimizer and scheduler for pytorch lighting. - """ - optimizer = self.optimizer(self.parameters(), **self.optimizer_params) - - # Only use scheduler if it's not ReduceLROnPlateau or if we can ensure val_loss is available - # For complex training setups, sometimes val_loss is not available every epoch - if hasattr(self.scheduler, '__name__') and 'ReduceLROnPlateau' in self.scheduler.__name__: - # For ReduceLROnPlateau, use train_loss as it's always available - scheduler = self.scheduler(optimizer, **self.scheduler_params) - scheduler_config = { - "scheduler": scheduler, - "monitor": "train_loss", - "interval": self.scheduler_interval, - } - return [optimizer], [scheduler_config] - else: - # For other schedulers (StepLR, etc.), no monitoring needed - scheduler = self.scheduler(optimizer, **self.scheduler_params) - return [optimizer], [scheduler] - - -# ============================================================================ -# Dataset -# ============================================================================ - -class FastTextModelDataset(torch.utils.data.Dataset): - """FastTextModelDataset class.""" - - def __init__( - self, - categorical_variables: List[List[int]], - texts: List[str], - tokenizer, # NGramTokenizer - outputs: List[int] = None, - **kwargs, - ): - """ - Constructor for the TorchDataset class. - - Args: - categorical_variables (List[List[int]]): The elements of this list - are the values of each categorical variable across the dataset. - text (List[str]): List of text descriptions. - y (List[int]): List of outcomes. - tokenizer (Tokenizer): Tokenizer. 
-        """
-
-        if categorical_variables is not None and len(categorical_variables) != len(texts):
-            raise ValueError("Categorical variables and texts must have the same length.")
-
-        if outputs is not None and len(outputs) != len(texts):
-            raise ValueError("Outputs and texts must have the same length.")
-
-        self.categorical_variables = categorical_variables
-        self.texts = texts
-        self.outputs = outputs
-        self.tokenizer = tokenizer
-
-    def __len__(self) -> int:
-        """
-        Returns length of the data.
-
-        Returns:
-            int: Number of observations.
-        """
-        return len(self.texts)
-
-    def __str__(self) -> str:
-        """
-        Returns description of the Dataset.
-
-        Returns:
-            str: Description.
-        """
-        return f"<FastTextModelDataset with {len(self.texts)} observations>"
-
-    def __getitem__(self, index: int) -> List:
-        """
-        Returns observation for a given index.
-
-        Args:
-            index (int): Index.
-
-        Returns:
-            Tuple: Text, categorical variables, and the label if available.
-        """
-        categorical_variables = (
-            self.categorical_variables[index] if self.categorical_variables is not None else None
-        )
-        text = self.texts[index]
-
-        if self.outputs is not None:
-            y = self.outputs[index]
-            return text, categorical_variables, y
-        else:
-            return text, categorical_variables
-
-    def collate_fn(self, batch):
-        """
-        Efficient batch processing without explicit loops.
-
-        Args:
-            batch: Data batch.
-
-        Returns:
-            Tuple[torch.Tensor]: Padded token indices, categorical tensors,
-                and labels if available.
-        """
-
-        # Unzip the batch in one go using zip(*batch)
-        if self.outputs is not None:
-            text, *categorical_vars, y = zip(*batch)
-        else:
-            text, *categorical_vars = zip(*batch)
-
-        # Convert text to indices in parallel using map
-        indices_batch = list(map(lambda x: self.tokenizer.indices_matrix(x)[0], text))
-
-        # Get padding index once
-        padding_index = self.tokenizer.get_buckets() + self.tokenizer.get_nwords()
-
-        # Pad sequences efficiently
-        padded_batch = torch.nn.utils.rnn.pad_sequence(
-            indices_batch,
-            batch_first=True,
-            padding_value=padding_index,
-        )
-
-        # Handle categorical variables efficiently
-        if self.categorical_variables is not None:
-            categorical_tensors = torch.stack(
-                [
-                    torch.tensor(cat_var, dtype=torch.float32)
-                    for cat_var in categorical_vars[
-                        0
-                    ]  # Access first element since zip returns tuple
-                ]
-            )
-        else:
-            categorical_tensors = torch.empty(
-                padded_batch.shape[0], 1, dtype=torch.float32, device=padded_batch.device
-            )
-
-        if self.outputs is not None:
-            # Convert labels to tensor in one go
-            y = torch.tensor(y, dtype=torch.long)
-            return (padded_batch, categorical_tensors, y)
-        else:
-            return (padded_batch, categorical_tensors)
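To make the padding step concrete, here is a self-contained sketch of what `collate_fn` does with the tokenized index sequences (toy values, not part of the diff):

```python
import torch

# Three tokenized sentences of unequal length (invented indices).
indices_batch = [torch.tensor([4, 8, 15]), torch.tensor([16, 23]), torch.tensor([42])]
padding_index = 99  # would be tokenizer.get_buckets() + tokenizer.get_nwords()

padded = torch.nn.utils.rnn.pad_sequence(
    indices_batch, batch_first=True, padding_value=padding_index
)
print(padded)
# tensor([[ 4,  8, 15],
#         [16, 23, 99],
#         [42, 99, 99]])
```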
-    def create_dataloader(
-        self,
-        batch_size: int,
-        shuffle: bool = False,
-        drop_last: bool = False,
-        num_workers: int = os.cpu_count() - 1,
-        pin_memory: bool = True,
-        persistent_workers: bool = True,
-        **kwargs,
-    ) -> torch.utils.data.DataLoader:
-        """
-        Creates a Dataloader from the FastTextModelDataset.
-        Use collate_fn() to tokenize and pad the sequences.
-
-        Args:
-            batch_size (int): Batch size.
-            shuffle (bool, optional): Shuffle option. Defaults to False.
-            drop_last (bool, optional): Drop last option. Defaults to False.
-            num_workers (int, optional): Number of workers. Defaults to os.cpu_count() - 1.
-            pin_memory (bool, optional): Set True if working on GPU, False if CPU. Defaults to True.
-            persistent_workers (bool, optional): Set True for training, False for inference. Defaults to True.
-            **kwargs: Additional arguments for PyTorch DataLoader.
-
-        Returns:
-            torch.utils.data.DataLoader: Dataloader.
-        """
-
-        logger.info(f"Creating DataLoader with {num_workers} workers.")
-
-        # persistent_workers requires num_workers > 0
-        if num_workers == 0:
-            persistent_workers = False
-
-        return torch.utils.data.DataLoader(
-            dataset=self,
-            batch_size=batch_size,
-            collate_fn=self.collate_fn,
-            shuffle=shuffle,
-            drop_last=drop_last,
-            pin_memory=pin_memory,
-            num_workers=num_workers,
-            persistent_workers=persistent_workers,
-            **kwargs,
-        )
\ No newline at end of file
diff --git a/torchTextClassifiers/classifiers/fasttext/tokenizer.py b/torchTextClassifiers/classifiers/fasttext/tokenizer.py
deleted file mode 100644
index 7573952..0000000
--- a/torchTextClassifiers/classifiers/fasttext/tokenizer.py
+++ /dev/null
@@ -1,346 +0,0 @@
-"""
-NGramTokenizer class.
-"""
-
-import ctypes
-import json
-from typing import List, Tuple, Type, Dict
-
-import numpy as np
-import torch
-from torch import Tensor
-from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
-from queue import Queue
-import multiprocessing
-
-from ...utilities.preprocess import clean_text_feature
-
-
-class NGramTokenizer:
-    """
-    NGramTokenizer class.
-    """
-
-    def __init__(
-        self,
-        min_count: int,
-        min_n: int,
-        max_n: int,
-        num_tokens: int,
-        len_word_ngrams: int,
-        training_text: List[str],
-        **kwargs,
-    ):
-        """
-        Constructor for the NGramTokenizer class.
-
-        Args:
-            min_count (int): Minimum number of times a word has to be
-                in the training data to be given an embedding.
-            min_n (int): Minimum length of character n-grams.
-            max_n (int): Maximum length of character n-grams.
-            num_tokens (int): Number of rows in the embedding matrix.
-            len_word_ngrams (int): Maximum length of word n-grams.
-            training_text (List[str]): List of training texts.
-
-        Raises:
-            ValueError: If `min_n` is 1 or smaller.
-            ValueError: If `max_n` is 7 or higher.
-        """
-        if min_n < 2:
-            raise ValueError("`min_n` parameter must be greater than 1.")
-        if max_n > 6:
-            raise ValueError("`max_n` parameter must be smaller than 7.")
-
-        self.min_count = min_count
-        self.min_n = min_n
-        self.max_n = max_n
-        self.num_tokens = num_tokens
-        self.word_ngrams = len_word_ngrams
-
-        word_counts = {}
-        for sentence in training_text:
-            for word in sentence.split(" "):
-                word_counts[word] = word_counts.setdefault(word, 0) + 1
-
-        self.word_id_mapping = {}
-        i = 1
-        for word, counts in word_counts.items():
-            if word_counts[word] >= min_count:
-                self.word_id_mapping[word] = i
-                i += 1
-        self.nwords = len(self.word_id_mapping)
-
-        self.padding_index = self.num_tokens + self.get_nwords()
-
-    def __str__(self) -> str:
-        """
-        Returns description of the NGramTokenizer.
-
-        Returns:
-            str: Description.
-        """
-        return f"<NGramTokenizer with {self.nwords} words and {self.num_tokens} buckets>"
-
-    def get_nwords(self) -> int:
-        """
-        Return number of words kept in training data.
-
-        Returns:
-            int: Number of words.
-        """
-        return self.nwords
-
-    def get_buckets(self) -> int:
-        """
-        Return number of buckets for tokenizer.
-
-        Returns:
-            int: Number of buckets.
-        """
-        return self.num_tokens
-
-    @staticmethod
-    def get_ngram_list(word: str, n: int) -> List[str]:
-        """
-        Return the list of character n-grams for a word with a
-        given n.
-
-        Args:
-            word (str): Word.
-            n (int): Length of the n-grams.
-
-        Returns:
-            List[str]: List of character n-grams.
-        """
-        return [word[i : i + n] for i in range(len(word) - n + 1)]
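For intuition, this is what `get_ngram_list` produces on a small example (illustrative only):

```python
word = "<cat>"  # word wrapped in fastText's boundary tags
n = 3
ngrams = [word[i : i + n] for i in range(len(word) - n + 1)]
print(ngrams)  # ['<ca', 'cat', 'at>']
```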
-    @staticmethod
-    def get_hash(subword: str) -> int:
-        """
-        Return the FNV-1a hash for a given subword.
-
-        Args:
-            subword (str): Character n-gram.
-
-        Returns:
-            int: Corresponding hash.
-        """
-        h = ctypes.c_uint32(2166136261).value  # FNV-1a 32-bit offset basis
-        for c in subword:
-            c = ctypes.c_int8(ord(c)).value
-            h = ctypes.c_uint32(h ^ c).value
-            h = ctypes.c_uint32(h * 16777619).value  # FNV prime
-        return h
-
-    @staticmethod
-    def get_word_ngram_id(hashes: Tuple[int], bucket: int, nwords: int) -> int:
-        """
-        Get word ngram index in the embedding matrix.
-
-        Args:
-            hashes (Tuple[int]): Word hashes.
-            bucket (int): Number of rows in embedding matrix.
-            nwords (int): Number of words in the vocabulary.
-
-        Returns:
-            int: Word ngram hash.
-        """
-        hashes = [ctypes.c_int32(hash_value).value for hash_value in hashes]
-        h = ctypes.c_uint64(hashes[0]).value
-        for j in range(1, len(hashes)):
-            h = ctypes.c_uint64((h * 116049371)).value
-            h = ctypes.c_uint64(h + hashes[j]).value
-        return h % bucket + nwords
-
-    def get_subword_index(self, subword: str) -> int:
-        """
-        Return the row index from the embedding matrix which
-        corresponds to a character n-gram.
-
-        Args:
-            subword (str): Character n-gram.
-
-        Returns:
-            int: Index.
-        """
-        return self.get_hash(subword) % self.num_tokens + self.nwords
-
-    def get_word_index(self, word: str) -> int:
-        """
-        Return the row index from the embedding matrix which
-        corresponds to a word.
-
-        Args:
-            word (str): Word.
-
-        Returns:
-            int: Index.
-        """
-        return self.word_id_mapping[word]
-
-    def get_subwords(self, word: str) -> Tuple[List[str], List[int]]:
-        """
-        Return all subword tokens and indices for a given word.
-        Also adds the whole-word token and index if the word is in word_id_mapping
-        (==> the word is in the initial vocabulary + seen at least MIN_COUNT times).
-        Adds tags "<" and ">" to the word.
-
-        Args:
-            word (str): Word.
-
-        Returns:
-            Tuple[List[str], List[int]]: Tuple of tokens and indices.
-        """
-        tokens = []
-        word_with_tags = "<" + word + ">"
-
-        # Get subwords and associated indices WITHOUT the whole word
-        for n in range(self.min_n, self.max_n + 1):
-            ngrams = self.get_ngram_list(word_with_tags, n)
-            tokens += [
-                ngram for ngram in ngrams if ngram != word_with_tags and ngram != word
-            ]  # Exclude the full word
-
-        indices = [self.get_subword_index(token) for token in tokens]
-        assert word not in tokens
-
-        # Add the word token and index only if the word is in word_id_mapping
-        if word in self.word_id_mapping.keys():
-            tokens = [word] + tokens
-            indices = [self.get_word_index(word)] + indices
-
-        return (tokens, indices)
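Combining `get_hash` with the bucket arithmetic of `get_subword_index`, each character n-gram is mapped to a deterministic embedding row in `[nwords, nwords + num_tokens)`. A self-contained sketch with invented sizes:

```python
import ctypes

def fnv1a_32(subword: str) -> int:
    # Same arithmetic as get_hash above: FNV-1a over signed bytes.
    h = ctypes.c_uint32(2166136261).value
    for c in subword:
        c = ctypes.c_int8(ord(c)).value
        h = ctypes.c_uint32(h ^ c).value
        h = ctypes.c_uint32(h * 16777619).value
    return h

num_tokens, nwords = 10_000, 42  # bucket count and vocabulary size (made up)
row = fnv1a_32("<ca") % num_tokens + nwords
print(row)  # deterministic row index in [nwords, nwords + num_tokens)
```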
-    def indices_matrix(self, sentence: str) -> tuple[torch.Tensor, dict, dict]:
-        """
-        Returns an array of token indices for a text description.
-
-        Args:
-            sentence (str): Text description.
-
-        Returns:
-            tuple: (torch.Tensor of indices, id_to_token dict, token_to_id dict)
-        """
-        # Pre-split the sentence once
-        words = sentence.split()
-        words.append("</s>")  # Add end-of-string token
-
-        indices = []
-        all_tokens_id = {}
-
-        # Process subwords in one batch
-        for word in words[:-1]:  # Exclude </s> from subword processing
-            tokens, ind = self.get_subwords(word)
-            indices.extend(ind)
-            # Update dictionary with zip for efficiency
-            all_tokens_id.update(zip(tokens, ind))
-
-        # Add the </s> token
-        indices.append(0)
-        all_tokens_id["</s>"] = 0
-
-        # Compute word n-grams more efficiently
-        if self.word_ngrams > 1:
-            # Pre-compute hashes for all words to avoid repeated computation
-            word_hashes = [self.get_hash(word) for word in words]
-
-            # Generate n-grams using sliding window
-            word_ngram_ids = []
-            for n in range(2, self.word_ngrams + 1):
-                for i in range(len(words) - n + 1):
-                    # Get slice of hashes for current n-gram
-                    gram_hashes = tuple(word_hashes[i : i + n])
-
-                    # Compute n-gram ID
-                    word_ngram_id = int(
-                        self.get_word_ngram_id(gram_hashes, self.num_tokens, self.nwords)
-                    )
-
-                    # Store gram and its ID
-                    gram = " ".join(words[i : i + n])
-                    all_tokens_id[gram] = word_ngram_id
-                    word_ngram_ids.append(word_ngram_id)
-
-            # Extend indices with n-gram IDs
-            indices.extend(word_ngram_ids)
-
-        # Create reverse mapping once at the end
-        id_to_token = {v: k for k, v in all_tokens_id.items()}
-
-        # Convert to tensor directly
-        return torch.tensor(indices, dtype=torch.long), id_to_token, all_tokens_id
-
-    def tokenize(self, text: list[str], text_tokens=True, preprocess=False):
-        """
-        Tokenize a list of sentences.
-
-        Args:
-            text (list[str]): List of sentences.
-            text_tokens (bool): If True, also return the tokenized text as string tokens.
-            preprocess (bool): If True, preprocess text. Needs unidecode library.
-
-        Returns:
-            tuple: Tokenized text as tensors of token indices, with the
-                id-to-token and token-to-id mapping dictionaries (preceded by
-                the tokens as strings if text_tokens is True).
-        """
-
-        if preprocess:
-            text = clean_text_feature(text)
-
-        tokenized_text = []
-        id_to_token_dicts = []
-        token_to_id_dicts = []
-        for sentence in text:
-            all_ind, id_to_token, token_to_id = self.indices_matrix(
-                sentence
-            )  # tokenize and convert to token indices
-            tokenized_text.append(all_ind)
-            id_to_token_dicts.append(id_to_token)
-            token_to_id_dicts.append(token_to_id)
-
-        if text_tokens:
-            tokenized_text_tokens = self._tokenized_text_in_tokens(
-                tokenized_text, id_to_token_dicts
-            )
-            return tokenized_text_tokens, tokenized_text, id_to_token_dicts, token_to_id_dicts
-        else:
-            return tokenized_text, id_to_token_dicts, token_to_id_dicts
-
-    def _tokenized_text_in_tokens(self, tokenized_text, id_to_token_dicts):
-        """
-        Convert tokenized text in int format to tokens in str format (given a mapping dictionary).
-        Private method. Used in tokenizer.tokenize and pytorch_model.predict()
-
-        Args:
-            tokenized_text (list): List of tokenized text in int format.
-            id_to_token_dicts (list[Dict]): List of dictionaries mapping token indices to tokens.
-
-        Both lists have the same length (number of sentences).
-
-        Returns:
-            list[list[str]]: List of tokenized text in str format.
-
-        """
-
-        return [
-            [
-                id_to_token_dicts[i][token_id.item()]
-                for token_id in tokenized_sentence
-                if token_id.item() not in {self.padding_index}
-            ]
-            for i, tokenized_sentence in enumerate(tokenized_text)
-        ]
-
-    def get_vocab(self):
-        return self.word_id_mapping
-
-    @classmethod
-    def from_json(cls: Type["NGramTokenizer"], filepath: str, training_text) -> "NGramTokenizer":
-        """
-        Load a dataclass instance from a JSON file.
- """ - with open(filepath, "r") as f: - data = json.load(f) - return cls(**data, training_text=training_text) diff --git a/torchTextClassifiers/classifiers/fasttext/wrapper.py b/torchTextClassifiers/classifiers/fasttext/wrapper.py deleted file mode 100644 index 3d1d559..0000000 --- a/torchTextClassifiers/classifiers/fasttext/wrapper.py +++ /dev/null @@ -1,216 +0,0 @@ -from typing import Optional -from ..base import BaseClassifierWrapper -from .core import FastTextConfig -from .tokenizer import NGramTokenizer -from .model import FastTextModel, FastTextModule, FastTextModelDataset -from ...utilities.checkers import check_X, check_Y -import logging -import numpy as np -import torch -from torch.optim import SGD, Adam - -logger = logging.getLogger() - - -class FastTextWrapper(BaseClassifierWrapper): - """Wrapper for FastText classifier.""" - - def __init__(self, config: FastTextConfig): - super().__init__(config) - self.config: FastTextConfig = config - self.tokenizer: Optional[NGramTokenizer] = None # FastText-specific tokenizer - - def prepare_text_features(self, training_text: np.ndarray) -> None: - """Build NGram tokenizer for FastText.""" - self.tokenizer = NGramTokenizer( - self.config.min_count, - self.config.min_n, - self.config.max_n, - self.config.num_tokens, - self.config.len_word_ngrams, - training_text, - ) - - def build_tokenizer(self, training_text: np.ndarray) -> None: - """Legacy method for backward compatibility.""" - self.prepare_text_features(training_text) - - def _build_pytorch_model(self) -> None: - """Build FastText PyTorch model.""" - if self.config.num_rows is None: - if self.tokenizer is None: - raise ValueError( - "Please provide a tokenizer or num_rows." - ) - else: - self.config.num_rows = self.tokenizer.padding_index + 1 - else: - if self.tokenizer is not None: - if self.config.num_rows != self.tokenizer.padding_index + 1: - logger.warning( - f"Divergent values for num_rows: {self.config.num_rows} and {self.tokenizer.padding_index + 1}. " - f"Using max value." - ) - self.config.num_rows = max(self.config.num_rows, self.tokenizer.padding_index + 1) - - self.padding_idx = self.config.num_rows - 1 - - # Update tokenizer padding index if necessary - if self.tokenizer is not None and self.padding_idx != self.tokenizer.padding_index: - self.tokenizer.padding_index = self.padding_idx - - self.pytorch_model = FastTextModel( - tokenizer=self.tokenizer, - embedding_dim=self.config.embedding_dim, - num_rows=self.config.num_rows, - num_classes=self.config.num_classes, - categorical_vocabulary_sizes=self.config.categorical_vocabulary_sizes, - categorical_embedding_dims=self.config.categorical_embedding_dims, - padding_idx=self.padding_idx, - sparse=self.config.sparse, - direct_bagging=self.config.direct_bagging, - ) - - def _check_and_init_lightning( - self, - optimizer=None, - optimizer_params=None, - lr=None, - scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau, - scheduler_params=None, - patience_scheduler=3, - loss=torch.nn.CrossEntropyLoss(), - ) -> None: - """Initialize Lightning module for FastText.""" - if optimizer is None: - if lr is None: - lr = getattr(self.config, 'learning_rate', 4e-3) # Use config or default - self.optimizer = SGD if self.config.sparse else Adam - self.optimizer_params = {"lr": lr} - else: - self.optimizer = optimizer - if optimizer_params is None: - if lr is not None: - self.optimizer_params = {"lr": lr} - else: - logger.warning("No optimizer parameters provided. 
Using defaults.") - self.optimizer_params = {} - - self.scheduler = scheduler - - if scheduler_params is None: - logger.warning("No scheduler parameters provided. Using defaults.") - self.scheduler_params = { - "mode": "min", - "patience": patience_scheduler, - } - else: - self.scheduler_params = scheduler_params - - self.loss = loss - - self.lightning_module = FastTextModule( - model=self.pytorch_model, - loss=self.loss, - optimizer=self.optimizer, - optimizer_params=self.optimizer_params, - scheduler=self.scheduler, - scheduler_params=self.scheduler_params, - scheduler_interval="epoch", - ) - - def predict(self, X: np.ndarray, top_k=1, preprocess=False, verbose=False) -> np.ndarray: - """Make predictions with FastText model.""" - if not self.trained: - raise Exception("Model must be trained first.") - - text, categorical_variables, no_cat_var = check_X(X) - if categorical_variables is not None: - if categorical_variables.shape[1] != self.config.num_categorical_features: - raise Exception( - f"X must have the same number of categorical variables as training data." - ) - else: - assert self.pytorch_model.no_cat_var == True - - predictions, confidence = self.pytorch_model.predict( - text, categorical_variables, top_k=top_k, preprocess=preprocess - ) - - # Return just predictions, squeeze out the top_k dimension if top_k=1 - if top_k == 1: - predictions = predictions.squeeze(-1) - - # Convert to numpy array for consistency - if hasattr(predictions, 'numpy'): - predictions = predictions.numpy() - - return predictions - - def validate(self, X: np.ndarray, Y: np.ndarray, batch_size=256, num_workers=12) -> float: - """Validate FastText model.""" - if not self.trained: - raise Exception("Model must be trained first.") - - # Use predict method which handles input validation and returns just predictions - predictions = self.predict(X) - y = check_Y(Y) - - # Convert predictions to numpy if it's a tensor - if hasattr(predictions, 'numpy'): - predictions = predictions.numpy() - - # Calculate accuracy - accuracy = (predictions == y).mean() - return float(accuracy) - - def predict_and_explain(self, X: np.ndarray, top_k=1): - """Predict and explain with FastText model.""" - if not self.trained: - raise Exception("Model must be trained first.") - - text, categorical_variables, no_cat_var = check_X(X) - if categorical_variables is not None: - if categorical_variables.shape[1] != self.config.num_categorical_features: - raise Exception( - f"X must have the same number of categorical variables as training data ({self.config.num_categorical_features})." 
- ) - else: - assert self.pytorch_model.no_cat_var == True - - return self.pytorch_model.predict_and_explain(text, categorical_variables, top_k=top_k) - - def create_dataset(self, texts: np.ndarray, labels: np.ndarray, categorical_variables: np.ndarray = None): - """Create FastText dataset.""" - return FastTextModelDataset( - categorical_variables=categorical_variables, - texts=texts, - outputs=labels, - tokenizer=self.tokenizer, - ) - - def create_dataloader(self, dataset, batch_size: int, num_workers: int = 0, shuffle: bool = True): - """Create FastText dataloader.""" - return dataset.create_dataloader(batch_size=batch_size, num_workers=num_workers, shuffle=shuffle) - - def load_best_model(self, checkpoint_path: str) -> None: - """Load best FastText model from checkpoint.""" - self.lightning_module = FastTextModule.load_from_checkpoint( - checkpoint_path, - model=self.pytorch_model, - loss=self.loss, - optimizer=self.optimizer, - optimizer_params=self.optimizer_params, - scheduler=self.scheduler, - scheduler_params=self.scheduler_params, - scheduler_interval="epoch", - ) - self.pytorch_model = self.lightning_module.model.to("cpu") - self.trained = True - self.pytorch_model.eval() - - @classmethod - def get_config_class(cls): - """Return the configuration class for FastText wrapper.""" - return FastTextConfig - diff --git a/torchTextClassifiers/classifiers/simple_text_classifier.py b/torchTextClassifiers/classifiers/simple_text_classifier.py deleted file mode 100644 index 12b2f15..0000000 --- a/torchTextClassifiers/classifiers/simple_text_classifier.py +++ /dev/null @@ -1,191 +0,0 @@ -""" -Simple text classifier example that doesn't require a tokenizer. - -This demonstrates how to create a classifier wrapper that uses -different text preprocessing approaches. 
-""" - -from typing import Optional, Dict, Any -from dataclasses import dataclass, asdict -import numpy as np -import torch -import torch.nn as nn -from sklearn.feature_extraction.text import TfidfVectorizer -from torch.utils.data import Dataset, DataLoader -import pytorch_lightning as pl -from torch.optim import Adam - -from .base import BaseClassifierWrapper, BaseClassifierConfig - - -@dataclass -class SimpleTextConfig(BaseClassifierConfig): - """Configuration for simple text classifier using TF-IDF.""" - - hidden_dim: int = 128 - num_classes: Optional[int] = None - max_features: int = 10000 - learning_rate: float = 1e-3 - dropout_rate: float = 0.1 - - def to_dict(self) -> Dict[str, Any]: - return asdict(self) - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "SimpleTextConfig": - return cls(**data) - - -class SimpleTextDataset(Dataset): - """Dataset for simple text classifier.""" - - def __init__(self, features: np.ndarray, labels: np.ndarray): - self.features = torch.FloatTensor(features) - self.labels = torch.LongTensor(labels) - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx): - return self.features[idx], self.labels[idx] - - -class SimpleTextModel(nn.Module): - """Simple neural network for text classification using TF-IDF features.""" - - def __init__(self, input_dim: int, hidden_dim: int, num_classes: int, dropout_rate: float = 0.1): - super().__init__() - - self.network = nn.Sequential( - nn.Linear(input_dim, hidden_dim), - nn.ReLU(), - nn.Dropout(dropout_rate), - nn.Linear(hidden_dim, hidden_dim // 2), - nn.ReLU(), - nn.Dropout(dropout_rate), - nn.Linear(hidden_dim // 2, num_classes) - ) - - def forward(self, x): - return self.network(x) - - -class SimpleTextModule(pl.LightningModule): - """Lightning module for simple text classifier.""" - - def __init__(self, model: nn.Module, learning_rate: float = 1e-3): - super().__init__() - self.model = model - self.learning_rate = learning_rate - self.loss_fn = nn.CrossEntropyLoss() - - def forward(self, x): - return self.model(x) - - def training_step(self, batch, batch_idx): - features, labels = batch - logits = self(features) - loss = self.loss_fn(logits, labels) - self.log('train_loss', loss) - return loss - - def validation_step(self, batch, batch_idx): - features, labels = batch - logits = self(features) - loss = self.loss_fn(logits, labels) - self.log('val_loss', loss) - return loss - - def configure_optimizers(self): - return Adam(self.parameters(), lr=self.learning_rate) - - -class SimpleTextWrapper(BaseClassifierWrapper): - """Wrapper for simple text classifier that uses TF-IDF instead of tokenization.""" - - def __init__(self, config: SimpleTextConfig): - super().__init__(config) - self.config: SimpleTextConfig = config - self.vectorizer: Optional[TfidfVectorizer] = None - - def prepare_text_features(self, training_text: np.ndarray) -> None: - """Prepare TF-IDF vectorizer instead of tokenizer.""" - self.vectorizer = TfidfVectorizer( - max_features=self.config.max_features, - lowercase=True, - stop_words='english' - ) - # Fit the vectorizer on training text - self.vectorizer.fit(training_text) - - def _build_pytorch_model(self) -> None: - """Build the PyTorch model.""" - if self.vectorizer is None: - raise ValueError("Must call prepare_text_features first") - - input_dim = len(self.vectorizer.get_feature_names_out()) - - self.pytorch_model = SimpleTextModel( - input_dim=input_dim, - hidden_dim=self.config.hidden_dim, - num_classes=self.config.num_classes, - 
dropout_rate=self.config.dropout_rate - ) - - def _check_and_init_lightning(self, **kwargs) -> None: - """Initialize Lightning module.""" - self.lightning_module = SimpleTextModule( - model=self.pytorch_model, - learning_rate=self.config.learning_rate - ) - - def predict(self, X: np.ndarray, **kwargs) -> np.ndarray: - """Make predictions.""" - if not self.trained: - raise Exception("Model must be trained first.") - - # Extract text from X (assuming first column is text) - text_data = X[:, 0] if X.ndim > 1 else X - - # Transform text to TF-IDF features - features = self.vectorizer.transform(text_data).toarray() - features_tensor = torch.FloatTensor(features) - - self.pytorch_model.eval() - with torch.no_grad(): - logits = self.pytorch_model(features_tensor) - predictions = torch.argmax(logits, dim=1) - - return predictions.numpy() - - def validate(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float: - """Validate the model.""" - predictions = self.predict(X) - accuracy = (predictions == Y).mean() - return float(accuracy) - - def create_dataset(self, texts: np.ndarray, labels: np.ndarray, categorical_variables: Optional[np.ndarray] = None): - """Create dataset.""" - # Transform text to TF-IDF features - features = self.vectorizer.transform(texts).toarray() - return SimpleTextDataset(features, labels) - - def create_dataloader(self, dataset, batch_size: int, num_workers: int = 0, shuffle: bool = True): - """Create dataloader.""" - return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle) - - def load_best_model(self, checkpoint_path: str) -> None: - """Load best model from checkpoint.""" - self.lightning_module = SimpleTextModule.load_from_checkpoint( - checkpoint_path, - model=self.pytorch_model, - learning_rate=self.config.learning_rate - ) - self.pytorch_model = self.lightning_module.model - self.trained = True - self.pytorch_model.eval() - - @classmethod - def get_config_class(cls): - """Return the configuration class.""" - return SimpleTextConfig \ No newline at end of file diff --git a/torchTextClassifiers/dataset/__init__.py b/torchTextClassifiers/dataset/__init__.py new file mode 100644 index 0000000..246df81 --- /dev/null +++ b/torchTextClassifiers/dataset/__init__.py @@ -0,0 +1 @@ +from .dataset import TextClassificationDataset as TextClassificationDataset diff --git a/torchTextClassifiers/dataset/dataset.py b/torchTextClassifiers/dataset/dataset.py new file mode 100644 index 0000000..9e7b764 --- /dev/null +++ b/torchTextClassifiers/dataset/dataset.py @@ -0,0 +1,114 @@ +import os +from typing import List, Union + +import numpy as np +import torch +from torch.utils.data import DataLoader, Dataset + +from torchTextClassifiers.tokenizers import BaseTokenizer + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +class TextClassificationDataset(Dataset): + def __init__( + self, + texts: List[str], + categorical_variables: Union[List[List[int]], np.array, None], + tokenizer: BaseTokenizer, + labels: Union[List[int], None] = None, + ): + self.categorical_variables = categorical_variables + + self.texts = texts + + if hasattr(tokenizer, "trained") and not tokenizer.trained: + raise RuntimeError( + f"Tokenizer {type(tokenizer)} must be trained before creating dataset." 
+ ) + + self.tokenizer = tokenizer + + self.texts = texts + self.tokenizer = tokenizer + self.labels = labels + + def __len__(self): + return len(self.texts) + + def __getitem__(self, idx): + if self.labels is not None: + return ( + str(self.texts[idx]), + ( + self.categorical_variables[idx] + if self.categorical_variables is not None + else None + ), + self.labels[idx], + ) + else: + return ( + str(self.texts[idx]), + ( + self.categorical_variables[idx] + if self.categorical_variables is not None + else None + ), + None, + ) + + def collate_fn(self, batch): + text, *categorical_vars, y = zip(*batch) + + if self.labels is not None: + labels_tensor = torch.tensor(y, dtype=torch.long) + else: + labels_tensor = None + + tokenize_output = self.tokenizer.tokenize(list(text)) + + if self.categorical_variables is not None: + categorical_tensors = torch.stack( + [ + torch.tensor(cat_var, dtype=torch.float32) + for cat_var in categorical_vars[ + 0 + ] # Access first element since zip returns tuple + ] + ) + else: + categorical_tensors = None + + return { + "input_ids": tokenize_output.input_ids, + "attention_mask": tokenize_output.attention_mask, + "categorical_vars": categorical_tensors, + "labels": labels_tensor, + } + + def create_dataloader( + self, + batch_size: int, + shuffle: bool = False, + drop_last: bool = False, + num_workers: int = os.cpu_count() - 1, + pin_memory: bool = False, + persistent_workers: bool = True, + **kwargs, + ): + # persistent_workers requires num_workers > 0 + if num_workers == 0: + persistent_workers = False + + return DataLoader( + dataset=self, + batch_size=batch_size, + collate_fn=self.collate_fn, + shuffle=shuffle, + drop_last=drop_last, + pin_memory=pin_memory, + num_workers=num_workers, + persistent_workers=persistent_workers, + **kwargs, + ) diff --git a/torchTextClassifiers/factories.py b/torchTextClassifiers/factories.py deleted file mode 100644 index e6a92a3..0000000 --- a/torchTextClassifiers/factories.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Generic factories for different classifier types.""" - -from typing import Dict, Any, Optional, Type, Callable -from .classifiers.base import BaseClassifierConfig - -# Registry of config factories for different classifier types -CONFIG_FACTORIES: Dict[str, Callable[[dict], BaseClassifierConfig]] = {} - - -def register_config_factory(classifier_type: str, factory_func: Callable[[dict], BaseClassifierConfig]): - """Register a config factory for a classifier type.""" - CONFIG_FACTORIES[classifier_type] = factory_func - - -def create_config_from_dict(classifier_type: str, config_dict: dict) -> BaseClassifierConfig: - """Create a config object from dictionary based on classifier type.""" - if classifier_type not in CONFIG_FACTORIES: - raise ValueError(f"Unsupported classifier type: {classifier_type}") - - return CONFIG_FACTORIES[classifier_type](config_dict) - - -# Register FastText factory -def _register_fasttext_factory(): - """Register FastText config factory.""" - try: - from .classifiers.fasttext.core import FastTextFactory - register_config_factory("fasttext", FastTextFactory.from_dict) - except ImportError: - pass # FastText module not available - - -# Auto-register available factories -_register_fasttext_factory() \ No newline at end of file diff --git a/torchTextClassifiers/model/__init__.py b/torchTextClassifiers/model/__init__.py new file mode 100644 index 0000000..29d16c9 --- /dev/null +++ b/torchTextClassifiers/model/__init__.py @@ -0,0 +1,2 @@ +from .lightning import TextClassificationModule as 
TextClassificationModule +from .model import TextClassificationModel as TextClassificationModel diff --git a/torchTextClassifiers/model/components/__init__.py b/torchTextClassifiers/model/components/__init__.py new file mode 100644 index 0000000..b14af0e --- /dev/null +++ b/torchTextClassifiers/model/components/__init__.py @@ -0,0 +1,12 @@ +from .attention import ( + AttentionConfig as AttentionConfig, +) +from .categorical_var_net import ( + CategoricalForwardType as CategoricalForwardType, +) +from .categorical_var_net import ( + CategoricalVariableNet as CategoricalVariableNet, +) +from .classification_head import ClassificationHead as ClassificationHead +from .text_embedder import TextEmbedder as TextEmbedder +from .text_embedder import TextEmbedderConfig as TextEmbedderConfig diff --git a/torchTextClassifiers/model/components/attention.py b/torchTextClassifiers/model/components/attention.py new file mode 100644 index 0000000..7c6474c --- /dev/null +++ b/torchTextClassifiers/model/components/attention.py @@ -0,0 +1,126 @@ +"""Largely inspired from Andrej Karpathy's nanochat, see here https://github.com/karpathy/nanochat/blob/master/nanochat/gpt.py""" + +from dataclasses import dataclass +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +### Some utils used in text_embedder.py for the attention blocks ### + + +def apply_rotary_emb(x, cos, sin): + assert x.ndim == 4 # multihead attention + + d = x.shape[3] // 2 + x1, x2 = x[..., :d], x[..., d:] # split up last time into two halves + y1 = x1 * cos + x2 * sin # rotate pairs of dims + y2 = x1 * (-sin) + x2 * cos + out = torch.cat([y1, y2], 3) # re-assemble + out = out.to(x.dtype) # ensure input/output dtypes match + return out + + +def norm(x): + # Purely functional rmsnorm with no learnable params + return F.rms_norm(x, (x.size(-1),)) + + +#### Config ##### +@dataclass +class AttentionConfig: + n_layers: int + n_head: int + n_kv_head: int + sequence_len: Optional[int] = None + positional_encoding: bool = True + aggregation_method: str = "mean" # or 'last', or 'first' + + +#### Attention Block ##### + +# Composed of SelfAttentionLayer and MLP with residual connections + + +class Block(nn.Module): + def __init__(self, config: AttentionConfig, layer_idx: int): + super().__init__() + + self.layer_idx = layer_idx + self.attn = SelfAttentionLayer(config, layer_idx) + self.mlp = MLP(config) + + def forward(self, x, cos_sin): + x = x + self.attn(norm(x), cos_sin) + x = x + self.mlp(norm(x)) + return x + + +##### Components of the Block ##### + + +class SelfAttentionLayer(nn.Module): + def __init__(self, config: AttentionConfig, layer_idx): + super().__init__() + self.layer_idx = layer_idx + self.n_head = config.n_head + self.n_kv_head = config.n_kv_head + self.enable_gqa = ( + self.n_head != self.n_kv_head + ) # Group Query Attention (GQA): duplicate key/value heads to match query heads if desired + self.n_embd = config.n_embd + self.head_dim = self.n_embd // self.n_head + assert self.n_embd % self.n_head == 0 + assert self.n_kv_head <= self.n_head and self.n_head % self.n_kv_head == 0 + self.c_q = nn.Linear(self.n_embd, self.n_head * self.head_dim, bias=False) + self.c_k = nn.Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False) + self.c_v = nn.Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False) + self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False) + + self.apply_positional_encoding = config.positional_encoding + + def forward(self, x, cos_sin=None): + B, T, C = 
x.size() + + # Project the input to get queries, keys, and values + q = self.c_q(x).view(B, T, self.n_head, self.head_dim) + k = self.c_k(x).view(B, T, self.n_kv_head, self.head_dim) + v = self.c_v(x).view(B, T, self.n_kv_head, self.head_dim) + + if self.apply_positional_encoding: + assert cos_sin is not None, "Rotary embeddings require precomputed cos/sin tensors" + cos, sin = cos_sin + q, k = ( + apply_rotary_emb(q, cos, sin), + apply_rotary_emb(k, cos, sin), + ) # QK rotary embedding + + q, k = norm(q), norm(k) # QK norm + q, k, v = ( + q.transpose(1, 2), + k.transpose(1, 2), + v.transpose(1, 2), + ) # make head be batch dim, i.e. (B, T, H, D) -> (B, H, T, D) + + # is_causal=False for non-autoregressive models (BERT-like) + y = F.scaled_dot_product_attention(q, k, v, is_causal=False, enable_gqa=self.enable_gqa) + + # Re-assemble the heads side by side and project back to residual stream + y = y.transpose(1, 2).contiguous().view(B, T, -1) + y = self.c_proj(y) + + return y + + +class MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=False) + self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=False) + + def forward(self, x): + x = self.c_fc(x) + x = F.relu(x).square() + x = self.c_proj(x) + return x diff --git a/torchTextClassifiers/model/components/categorical_var_net.py b/torchTextClassifiers/model/components/categorical_var_net.py new file mode 100644 index 0000000..973dd6c --- /dev/null +++ b/torchTextClassifiers/model/components/categorical_var_net.py @@ -0,0 +1,128 @@ +from enum import Enum +from typing import List, Optional, Union + +import torch +from torch import nn + + +class CategoricalForwardType(Enum): + SUM_TO_TEXT = "EMBEDDING_SUM_TO_TEXT" + AVERAGE_AND_CONCAT = "EMBEDDING_AVERAGE_AND_CONCAT" + CONCATENATE_ALL = "EMBEDDING_CONCATENATE_ALL" + + +class CategoricalVariableNet(nn.Module): + def __init__( + self, + categorical_vocabulary_sizes: List[int], + categorical_embedding_dims: Optional[Union[List[int], int]] = None, + text_embedding_dim: Optional[int] = None, + ): + super().__init__() + + self.categorical_vocabulary_sizes = categorical_vocabulary_sizes + self.categorical_embedding_dims = categorical_embedding_dims + self.text_embedding_dim = text_embedding_dim + + self._validate_categorical_inputs() + assert isinstance( + self.forward_type, CategoricalForwardType + ), "forward_type must be set after validation" + assert isinstance(self.output_dim, int), "output_dim must be set as int after validation" + + self.categorical_embedding_layers = {} + + for var_idx, num_rows in enumerate(self.categorical_vocabulary_sizes): + emb_layer = nn.Embedding( + num_embeddings=num_rows, + embedding_dim=self.categorical_embedding_dims[var_idx], + ) + self.categorical_embedding_layers[var_idx] = emb_layer + setattr(self, f"categorical_embedding_{var_idx}", emb_layer) + + def forward(self, categorical_vars_tensor: torch.Tensor) -> torch.Tensor: + cat_embeds = self._get_cat_embeds(categorical_vars_tensor) + if self.forward_type == CategoricalForwardType.SUM_TO_TEXT: + x_combined = torch.stack(cat_embeds, dim=0).sum(dim=0) # (bs, text_embed_dim) + elif self.forward_type == CategoricalForwardType.AVERAGE_AND_CONCAT: + x_combined = torch.stack(cat_embeds, dim=0).mean(dim=0) # (bs, embed_dim) + elif self.forward_type == CategoricalForwardType.CONCATENATE_ALL: + x_combined = torch.cat(cat_embeds, dim=1) # (bs, sum of all cat embed dims) + else: + raise ValueError(f"Unknown forward type: 
{self.forward_type}") + + assert ( + x_combined.dim() == 2 + ), "Output combined tensor must be 2-dimensional (batch_size, embed_dim)" + assert x_combined.size(1) == self.output_dim + + return x_combined + + def _get_cat_embeds(self, categorical_vars_tensor: torch.Tensor): + if categorical_vars_tensor.dtype != torch.long: + categorical_vars_tensor = categorical_vars_tensor.to(torch.long) + cat_embeds = [] + + for i, embed_layer in self.categorical_embedding_layers.items(): + cat_var_tensor = categorical_vars_tensor[:, i] + + # Check if categorical values are within valid range + vocab_size = embed_layer.num_embeddings + max_val = cat_var_tensor.max().item() + min_val = cat_var_tensor.min().item() + + if max_val >= vocab_size or min_val < 0: + raise ValueError( + f"Categorical feature {i}: values range [{min_val}, {max_val}] exceed vocabulary size {vocab_size}." + ) + + cat_embed = embed_layer(cat_var_tensor) + if cat_embed.dim() > 2: + cat_embed = cat_embed.squeeze(1) + cat_embeds.append(cat_embed) + + return cat_embeds + + def _validate_categorical_inputs(self): + categorical_vocabulary_sizes = self.categorical_vocabulary_sizes + categorical_embedding_dims = self.categorical_embedding_dims + + if not isinstance(categorical_vocabulary_sizes, list): + raise TypeError("categorical_vocabulary_sizes must be a list of int") + + if isinstance(categorical_embedding_dims, list): + if len(categorical_vocabulary_sizes) != len(categorical_embedding_dims): + raise ValueError( + "Categorical vocabulary sizes and their embedding dimensions must have the same length" + ) + + num_categorical_features = len(categorical_vocabulary_sizes) + + # "Transform" embedding dims into a suitable list, or stay None + if categorical_embedding_dims is not None: + if isinstance(categorical_embedding_dims, int): + self.forward_type = CategoricalForwardType.AVERAGE_AND_CONCAT + self.output_dim = categorical_embedding_dims + categorical_embedding_dims = [categorical_embedding_dims] * num_categorical_features + + elif isinstance(categorical_embedding_dims, list): + self.forward_type = CategoricalForwardType.CONCATENATE_ALL + self.output_dim = sum(categorical_embedding_dims) + else: + raise TypeError("categorical_embedding_dims must be an int, a list of int or None") + else: + if self.text_embedding_dim is None: + raise ValueError( + "If categorical_embedding_dims is None, text_embedding_dim must be provided" + ) + self.forward_type = CategoricalForwardType.SUM_TO_TEXT + self.output_dim = self.text_embedding_dim + categorical_embedding_dims = [self.text_embedding_dim] * num_categorical_features + + assert ( + isinstance(categorical_embedding_dims, list) or categorical_embedding_dims is None + ), "categorical_embedding_dims must be a list of int at this point" + + self.categorical_vocabulary_sizes = categorical_vocabulary_sizes + self.categorical_embedding_dims = categorical_embedding_dims + self.num_categorical_features = num_categorical_features diff --git a/torchTextClassifiers/model/components/classification_head.py b/torchTextClassifiers/model/components/classification_head.py new file mode 100644 index 0000000..4297d27 --- /dev/null +++ b/torchTextClassifiers/model/components/classification_head.py @@ -0,0 +1,43 @@ +from typing import Optional + +import torch +from torch import nn + + +class ClassificationHead(nn.Module): + def __init__( + self, + input_dim: Optional[int] = None, + num_classes: Optional[int] = None, + net: Optional[nn.Module] = None, + ): + super().__init__() + if net is not None: + self.net = net + 
self.input_dim, self.num_classes = self._get_linear_input_output_dims(net) # supports Linear, Sequential, or nested custom heads + else: + assert ( + input_dim is not None and num_classes is not None + ), "Either net or both input_dim and num_classes must be provided." + self.net = nn.Linear(input_dim, num_classes) + self.input_dim, self.num_classes = self._get_linear_input_output_dims(self.net) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.net(x) + + @staticmethod + def _get_linear_input_output_dims(module: nn.Module): + """ + Returns (input_dim, output_dim) for any module containing Linear layers. + Works for Linear, Sequential, or nested models. + """ + # Collect all Linear layers recursively + linears = [m for m in module.modules() if isinstance(m, nn.Linear)] + + if not linears: + raise ValueError("No Linear layers found in the given module.") + + input_dim = linears[0].in_features + output_dim = linears[-1].out_features + return input_dim, output_dim diff --git a/torchTextClassifiers/model/components/text_embedder.py b/torchTextClassifiers/model/components/text_embedder.py new file mode 100644 index 0000000..e9703b7 --- /dev/null +++ b/torchTextClassifiers/model/components/text_embedder.py @@ -0,0 +1,220 @@ +import math +from dataclasses import dataclass +from typing import Optional + +import torch +from torch import nn + +from torchTextClassifiers.model.components.attention import AttentionConfig, Block, norm + + +@dataclass +class TextEmbedderConfig: + vocab_size: int + embedding_dim: int + padding_idx: int + attention_config: Optional[AttentionConfig] = None + + +class TextEmbedder(nn.Module): + def __init__(self, text_embedder_config: TextEmbedderConfig): + super().__init__() + + self.config = text_embedder_config + + self.attention_config = text_embedder_config.attention_config + if self.attention_config is not None: + self.attention_config.n_embd = text_embedder_config.embedding_dim + + self.vocab_size = text_embedder_config.vocab_size + self.embedding_dim = text_embedder_config.embedding_dim + self.padding_idx = text_embedder_config.padding_idx + + self.embedding_layer = nn.Embedding( + embedding_dim=self.embedding_dim, + num_embeddings=self.vocab_size, + padding_idx=self.padding_idx, + ) + + if self.attention_config is not None: + self.transformer = nn.ModuleDict( + { + "h": nn.ModuleList( + [ + Block(self.attention_config, layer_idx) + for layer_idx in range(self.attention_config.n_layers) + ] + ), + } + ) + + head_dim = self.attention_config.n_embd // self.attention_config.n_head + + if head_dim * self.attention_config.n_head != self.attention_config.n_embd: + raise ValueError("embedding_dim must be divisible by n_head.") + + if self.attention_config.positional_encoding: + if head_dim % 2 != 0: + raise ValueError( + "embedding_dim / n_head must be even for rotary positional embeddings." + ) + + if self.attention_config.sequence_len is None: + raise ValueError( + "sequence_len must be specified in AttentionConfig when positional_encoding is True."
+ ) + + self.rotary_seq_len = self.attention_config.sequence_len * 10 + cos, sin = self._precompute_rotary_embeddings( + seq_len=self.rotary_seq_len, head_dim=head_dim + ) + + self.register_buffer( + "cos", cos, persistent=False + ) # persistent=False means it's not saved to the checkpoint + self.register_buffer("sin", sin, persistent=False) + + def init_weights(self): + self.apply(self._init_weights) + + # zero out c_proj weights in all blocks + if self.attention_config is not None: + for block in self.transformer.h: + torch.nn.init.zeros_(block.mlp.c_proj.weight) + torch.nn.init.zeros_(block.attn.c_proj.weight) + # init the rotary embeddings + head_dim = self.attention_config.n_embd // self.attention_config.n_head + cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim) + self.cos, self.sin = cos, sin + # Cast the embeddings from fp32 to bf16: optim can tolerate it and it saves memory: both in the model and the activations + if self.embedding_layer.weight.device.type == "cuda": + self.embedding_layer.to(dtype=torch.bfloat16) + + def _init_weights(self, module): + if isinstance(module, nn.Linear): + # https://arxiv.org/pdf/2310.17813 + fan_out = module.weight.size(0) + fan_in = module.weight.size(1) + std = 1.0 / math.sqrt(fan_in) * min(1.0, math.sqrt(fan_out / fan_in)) + torch.nn.init.normal_(module.weight, mean=0.0, std=std) + if module.bias is not None: + torch.nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + torch.nn.init.normal_(module.weight, mean=0.0, std=1.0) + + def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: + """Converts input token IDs to their corresponding embeddings.""" + + encoded_text = input_ids # clearer name + if encoded_text.dtype != torch.long: + encoded_text = encoded_text.to(torch.long) + + batch_size, seq_len = encoded_text.shape + batch_size_check, seq_len_check = attention_mask.shape + + if batch_size != batch_size_check or seq_len != seq_len_check: + raise ValueError( + f"Input IDs and attention mask must have the same batch size and sequence length. " + f"Got input_ids shape {encoded_text.shape} and attention_mask shape {attention_mask.shape}." + ) + + token_embeddings = self.embedding_layer( + encoded_text + ) # (batch_size, seq_len, embedding_dim) + + token_embeddings = norm(token_embeddings) + + if self.attention_config is not None: + if self.attention_config.positional_encoding: + cos_sin = self.cos[:, :seq_len], self.sin[:, :seq_len] + else: + cos_sin = None + + for block in self.transformer.h: + token_embeddings = block(token_embeddings, cos_sin) + + token_embeddings = norm(token_embeddings) + + text_embedding = self._get_sentence_embedding( + token_embeddings=token_embeddings, attention_mask=attention_mask + ) + + return text_embedding + + def _get_sentence_embedding( + self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor + ) -> torch.Tensor: + """ + Compute sentence embedding from embedded tokens - "remove" second dimension. 
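+ + With attention enabled, aggregation_method selects mean pooling over non-pad tokens (the default), the first token (CLS-style), or the last non-pad token; without attention, mean pooling is always used.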
+ + Args (output from dataset collate_fn): + token_embeddings (torch.Tensor[float]), shape (batch_size, seq_len, embedding_dim): Embedded (and padded) tokens + attention_mask (torch.Tensor[Long]), shape (batch_size, seq_len): Attention mask indicating non-pad tokens + Returns: + torch.Tensor: Sentence embeddings, shape (batch_size, embedding_dim) + """ + + # average over non-pad token embeddings + # attention mask has 1 for non-pad tokens and 0 for pad token positions + + # mask pad-tokens + + if self.attention_config is not None: + if self.attention_config.aggregation_method is not None: + if self.attention_config.aggregation_method == "first": + return token_embeddings[:, 0, :] + elif self.attention_config.aggregation_method == "last": + lengths = attention_mask.sum(dim=1).clamp(min=1) # last non-pad token index + 1 + return token_embeddings[ + torch.arange(token_embeddings.size(0)), + lengths - 1, + :, + ] + else: + if self.attention_config.aggregation_method != "mean": + raise ValueError( + f"Unknown aggregation method: {self.attention_config.aggregation_method}. Supported methods are 'mean', 'first', 'last'." + ) + + assert self.attention_config is None or self.attention_config.aggregation_method == "mean" + + mask = attention_mask.unsqueeze(-1).float() # (batch_size, seq_len, 1) + masked_embeddings = token_embeddings * mask # (batch_size, seq_len, embedding_dim) + + sentence_embedding = masked_embeddings.sum(dim=1) / mask.sum(dim=1).clamp( + min=1.0 + ) # avoid division by zero + + sentence_embedding = torch.nan_to_num(sentence_embedding, 0.0) + + return sentence_embedding + + def __call__(self, *args, **kwargs): + out = super().__call__(*args, **kwargs) + if out.dim() != 2: + raise ValueError( + f"Output of {self.__class__.__name__}.forward must be 2D " + f"(got shape {tuple(out.shape)})" + ) + return out + + def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None): + # autodetect the device from model embeddings + if device is None: + device = next(self.parameters()).device + + # stride the channels + channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device) + inv_freq = 1.0 / (base ** (channel_range / head_dim)) + # stride the time steps + t = torch.arange(seq_len, dtype=torch.float32, device=device) + # calculate the rotation frequencies at each (time, channel) pair + freqs = torch.outer(t, inv_freq) + cos, sin = freqs.cos(), freqs.sin() + cos, sin = cos.bfloat16(), sin.bfloat16() # keep them in bfloat16 + cos, sin = ( + cos[None, :, None, :], + sin[None, :, None, :], + ) # add batch and head dims for later broadcasting + + return cos, sin diff --git a/torchTextClassifiers/model/lightning.py b/torchTextClassifiers/model/lightning.py new file mode 100644 index 0000000..e432082 --- /dev/null +++ b/torchTextClassifiers/model/lightning.py @@ -0,0 +1,166 @@ +import pytorch_lightning as pl +import torch +from torchmetrics import Accuracy + +from .model import TextClassificationModel + +# ============================================================================ +# PyTorch Lightning Module +# ============================================================================ + + +class TextClassificationModule(pl.LightningModule): + """PyTorch Lightning module wrapping a TextClassificationModel.""" + + def __init__( + self, + model: TextClassificationModel, + loss, + optimizer, + optimizer_params, + scheduler, + scheduler_params, + scheduler_interval="epoch", + **kwargs, + ): + """ + Initialize TextClassificationModule. + + Args: + model: The wrapped TextClassificationModel.
+ loss: Loss function (e.g. torch.nn.CrossEntropyLoss). + optimizer: Optimizer class. + optimizer_params: Optimizer parameters. + scheduler: Scheduler class. + scheduler_params: Scheduler parameters. + scheduler_interval: Scheduler interval. + """ + super().__init__() + self.save_hyperparameters(ignore=["model", "loss"]) + + self.model = model + self.loss = loss + self.accuracy_fn = Accuracy(task="multiclass", num_classes=self.model.num_classes) + self.optimizer = optimizer + self.optimizer_params = optimizer_params + self.scheduler = scheduler + self.scheduler_params = scheduler_params + self.scheduler_interval = scheduler_interval + + def forward(self, batch) -> torch.Tensor: + """ + Perform forward-pass. + + Args: + batch (dict): Batch with "input_ids", "attention_mask" and optional "categorical_vars". + + Returns (torch.Tensor): Prediction. + """ + return self.model( + input_ids=batch["input_ids"], + attention_mask=batch["attention_mask"], + categorical_vars=batch.get("categorical_vars", None), + ) + + def training_step(self, batch, batch_idx: int) -> torch.Tensor: + """ + Training step. + + Args: + batch (dict): Training batch. + batch_idx (int): Batch index. + + Returns (torch.Tensor): Loss tensor. + """ + + targets = batch["labels"] + + outputs = self.forward(batch) + loss = self.loss(outputs, targets) + self.log("train_loss", loss, on_epoch=True, on_step=True, prog_bar=True) + accuracy = self.accuracy_fn(outputs, targets) + self.log("train_accuracy", accuracy, on_epoch=True, on_step=False, prog_bar=True) + + torch.cuda.empty_cache() + + return loss + + def validation_step(self, batch, batch_idx: int): + """ + Validation step. + + Args: + batch (dict): Validation batch. + batch_idx (int): Batch index. + + Returns (torch.Tensor): Loss tensor. + """ + targets = batch["labels"] + + outputs = self.forward(batch) + loss = self.loss(outputs, targets) + self.log("val_loss", loss, on_epoch=True, on_step=False, prog_bar=True, sync_dist=True) + + accuracy = self.accuracy_fn(outputs, targets) + self.log("val_accuracy", accuracy, on_epoch=True, on_step=False, prog_bar=True) + return loss + + def test_step(self, batch, batch_idx: int): + """ + Test step. + + Args: + batch (dict): Test batch. + batch_idx (int): Batch index. + + Returns (torch.Tensor): Loss tensor. + """ + targets = batch["labels"] + + outputs = self.forward(batch) + loss = self.loss(outputs, targets) + + accuracy = self.accuracy_fn(outputs, targets) + + return loss, accuracy + + def predict_step(self, batch, batch_idx: int = 0, dataloader_idx: int = 0): + """ + Prediction step. + + Args: + batch (dict): Prediction batch. + batch_idx (int): Batch index. + dataloader_idx (int): Dataloader index. + + Returns (torch.Tensor): Predictions. + """ + outputs = self.forward(batch) + return outputs + + def configure_optimizers(self): + """ + Configure optimizer for PyTorch Lightning. + + Returns: Optimizer and scheduler for PyTorch Lightning.
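+ + Example (illustrative sketch, assuming an already-built module): + >>> module = TextClassificationModule(model, loss, torch.optim.Adam, {"lr": 1e-3}, None, {}) + >>> optimizer = module.configure_optimizers() # a bare optimizer when no scheduler is set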
+ """ + optimizer = self.optimizer(self.parameters(), **self.optimizer_params) + + if self.scheduler is None: + return optimizer + + # Only use scheduler if it's not ReduceLROnPlateau or if we can ensure val_loss is available + # For complex training setups, sometimes val_loss is not available every epoch + if hasattr(self.scheduler, "__name__") and "ReduceLROnPlateau" in self.scheduler.__name__: + # For ReduceLROnPlateau, use train_loss as it's always available + scheduler = self.scheduler(optimizer, **self.scheduler_params) + scheduler_config = { + "scheduler": scheduler, + "monitor": "train_loss", + "interval": self.scheduler_interval, + } + return [optimizer], [scheduler_config] + else: + # For other schedulers (StepLR, etc.), no monitoring needed + scheduler = self.scheduler(optimizer, **self.scheduler_params) + return [optimizer], [scheduler] diff --git a/torchTextClassifiers/model/model.py b/torchTextClassifiers/model/model.py new file mode 100644 index 0000000..d9cffbf --- /dev/null +++ b/torchTextClassifiers/model/model.py @@ -0,0 +1,151 @@ +"""FastText model components. + +This module contains the PyTorch model, Lightning module, and dataset classes +for FastText classification. Consolidates what was previously in pytorch_model.py, +lightning_module.py, and dataset.py. +""" + +import logging +from typing import Annotated, Optional + +import torch +from torch import nn + +from torchTextClassifiers.model.components import ( + CategoricalForwardType, + CategoricalVariableNet, + ClassificationHead, + TextEmbedder, +) + +logger = logging.getLogger(__name__) + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler()], +) + + +# ============================================================================ +# PyTorch Model + +# It takes PyTorch tensors as input (not raw text!), +# and it outputs raw not-softmaxed logits, not predictions +# ============================================================================ + + +class TextClassificationModel(nn.Module): + """FastText Pytorch Model.""" + + def __init__( + self, + classification_head: ClassificationHead, + text_embedder: Optional[TextEmbedder] = None, + categorical_variable_net: Optional[CategoricalVariableNet] = None, + ): + """ + Constructor for the FastTextModel class. + + Args: + classification_head (ClassificationHead): The classification head module. + text_embedder (Optional[TextEmbedder]): The text embedding module. + If not provided, assumes that input text is already embedded (as tensors) and directly passed to the classification head. + categorical_variable_net (Optional[CategoricalVariableNet]): The categorical variable network module. + If not provided, assumes no categorical variables are used. 
+ """ + super().__init__() + + self.text_embedder = text_embedder + + self.categorical_variable_net = categorical_variable_net + if not self.categorical_variable_net: + logger.info("🔹 No categorical variable network provided; using only text embeddings.") + + self.classification_head = classification_head + + self._validate_component_connections() + + self.num_classes = self.classification_head.num_classes + + torch.nn.init.zeros_(self.classification_head.net.weight) + if self.text_embedder is not None: + self.text_embedder.init_weights() + + def _validate_component_connections(self): + def _check_text_categorical_connection(self, text_embedder, cat_var_net): + if cat_var_net.forward_type == CategoricalForwardType.SUM_TO_TEXT: + if text_embedder.embedding_dim != cat_var_net.output_dim: + raise ValueError( + "Text embedding dimension must match categorical variable embedding dimension." + ) + self.expected_classification_head_input_dim = text_embedder.embedding_dim + else: + self.expected_classification_head_input_dim = ( + text_embedder.embedding_dim + cat_var_net.output_dim + ) + + if self.text_embedder: + if self.categorical_variable_net: + _check_text_categorical_connection( + self, self.text_embedder, self.categorical_variable_net + ) + else: + self.expected_classification_head_input_dim = self.text_embedder.embedding_dim + + if self.expected_classification_head_input_dim != self.classification_head.input_dim: + raise ValueError( + "Classification head input dimension does not match expected dimension from text embedder and categorical variable net." + ) + else: + logger.warning( + "⚠️ No text embedder provided; assuming input text is already embedded or vectorized. Take care that the classification head input dimension matches the input text dimension." + ) + + def forward( + self, + input_ids: Annotated[torch.Tensor, "batch seq_len"], + attention_mask: Annotated[torch.Tensor, "batch seq_len"], + categorical_vars: Annotated[torch.Tensor, "batch num_cats"], + **kwargs, + ) -> torch.Tensor: + """ + Memory-efficient forward pass implementation. + + Args: output from dataset collate_fn + input_ids (torch.Tensor[Long]), shape (batch_size, seq_len): Tokenized + padded text + attention_mask (torch.Tensor[int]), shape (batch_size, seq_len): Attention mask indicating non-pad tokens + categorical_vars (torch.Tensor[Long]): Additional categorical features, (batch_size, num_categorical_features) + + Returns: + torch.Tensor: Model output scores for each class - shape (batch_size, num_classes) + Raw, not softmaxed. 
+ """ + encoded_text = input_ids # clearer name + if self.text_embedder is None: + x_text = encoded_text.float() + else: + x_text = self.text_embedder(input_ids=encoded_text, attention_mask=attention_mask) + + if self.categorical_variable_net: + x_cat = self.categorical_variable_net(categorical_vars) + + if ( + self.categorical_variable_net.forward_type + == CategoricalForwardType.AVERAGE_AND_CONCAT + or self.categorical_variable_net.forward_type + == CategoricalForwardType.CONCATENATE_ALL + ): + x_combined = torch.cat((x_text, x_cat), dim=1) + else: + assert ( + self.categorical_variable_net.forward_type == CategoricalForwardType.SUM_TO_TEXT + ) + x_combined = x_text + x_cat + else: + x_combined = x_text + + logits = self.classification_head(x_combined) + + return logits diff --git a/torchTextClassifiers/tokenizers/WordPiece.py b/torchTextClassifiers/tokenizers/WordPiece.py new file mode 100644 index 0000000..280d11d --- /dev/null +++ b/torchTextClassifiers/tokenizers/WordPiece.py @@ -0,0 +1,92 @@ +import logging +import os +from typing import List, Optional + +from torchTextClassifiers.tokenizers import HAS_HF, HuggingFaceTokenizer + +if not HAS_HF: + raise ImportError( + "The HuggingFace dependencies are needed to use this tokenizer. Please run 'uv add torchTextClassifiers --extra huggingface." + ) +else: + from tokenizers import ( + Tokenizer, + decoders, + models, + normalizers, + pre_tokenizers, + processors, + trainers, + ) + from transformers import PreTrainedTokenizerFast + +logger = logging.getLogger(__name__) + + +class WordPieceTokenizer(HuggingFaceTokenizer): + def __init__(self, vocab_size: int, trained: bool = False, output_dim: Optional[int] = None): + """Largely inspired by https://huggingface.co/learn/llm-course/chapter6/8""" + + super().__init__(vocab_size=vocab_size, output_dim=output_dim) + + self.unk_token = "[UNK]" + self.pad_token = "[PAD]" + self.cls_token = "[CLS]" + self.sep_token = "[SEP]" + self.special_tokens = [ + self.unk_token, + self.pad_token, + self.cls_token, + self.sep_token, + ] + self.vocab_size = vocab_size + self.context_size = output_dim + + self.tokenizer = Tokenizer(models.WordPiece(unk_token=self.unk_token)) + + self.tokenizer.normalizer = normalizers.BertNormalizer( + lowercase=True + ) # NFD, lowercase, strip accents - BERT style + + self.tokenizer.pre_tokenizer = ( + pre_tokenizers.BertPreTokenizer() + ) # split on whitespace and punctuation - BERT style + self.trained = trained + + def _post_training(self): + if not self.trained: + raise RuntimeError( + "Tokenizer must be trained before applying post-training configurations." 
+ ) + + self.tokenizer.post_processor = processors.BertProcessing( + (self.cls_token, self.tokenizer.token_to_id(self.cls_token)), + (self.sep_token, self.tokenizer.token_to_id(self.sep_token)), + ) + self.tokenizer.decoder = decoders.WordPiece(prefix="##") + self.padding_idx = self.tokenizer.token_to_id("[PAD]") + self.tokenizer.enable_padding(pad_id=self.padding_idx, pad_token="[PAD]") + + self.tokenizer = PreTrainedTokenizerFast(tokenizer_object=self.tokenizer) + self.vocab_size = len(self.tokenizer) + + def train( + self, training_corpus: List[str], save_path: Optional[str] = None, filesystem=None, s3_save_path=None + ): + trainer = trainers.WordPieceTrainer( + vocab_size=self.vocab_size, + special_tokens=self.special_tokens, + ) + self.tokenizer.train_from_iterator(training_corpus, trainer=trainer) + self.trained = True + self._post_training() + + if save_path: + self.tokenizer.save(save_path) + logger.info(f"💾 Tokenizer saved at {save_path}") + if filesystem and s3_save_path: + parent_dir = os.path.dirname(save_path) + if not filesystem.exists(parent_dir): + filesystem.mkdirs(parent_dir) + filesystem.put(save_path, s3_save_path) + logger.info(f"💾 Tokenizer uploaded to S3 at {s3_save_path}") diff --git a/torchTextClassifiers/tokenizers/__init__.py b/torchTextClassifiers/tokenizers/__init__.py new file mode 100644 index 0000000..ecba309 --- /dev/null +++ b/torchTextClassifiers/tokenizers/__init__.py @@ -0,0 +1,10 @@ +from .base import ( + HAS_HF as HAS_HF, +) +from .base import BaseTokenizer as BaseTokenizer +from .base import ( + HuggingFaceTokenizer as HuggingFaceTokenizer, +) +from .base import TokenizerOutput as TokenizerOutput +from .ngram import NGramTokenizer as NGramTokenizer +from .WordPiece import WordPieceTokenizer as WordPieceTokenizer diff --git a/torchTextClassifiers/tokenizers/base.py b/torchTextClassifiers/tokenizers/base.py new file mode 100644 index 0000000..dee5546 --- /dev/null +++ b/torchTextClassifiers/tokenizers/base.py @@ -0,0 +1,205 @@ +from abc import ABC, abstractmethod +from dataclasses import asdict, dataclass +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import torch + +try: + from tokenizers import Tokenizer + from transformers import AutoTokenizer, PreTrainedTokenizerFast + + HAS_HF = True +except ImportError: + HAS_HF = False + + +@dataclass +class TokenizerOutput: + input_ids: torch.Tensor # shape: (batch_size, seq_len) + attention_mask: torch.Tensor # shape: (batch_size, seq_len) + offset_mapping: Optional[torch.Tensor] = None # shape: (batch_size, seq_len, 2) + word_ids: Optional[np.ndarray] = None # shape: (batch_size, seq_len) + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "TokenizerOutput": + return cls(**data) + + def __post_init__(self): + # --- Basic type checks --- + if not isinstance(self.input_ids, torch.Tensor): + raise TypeError(f"input_ids must be a torch.Tensor, got {type(self.input_ids)}") + if not isinstance(self.attention_mask, torch.Tensor): + raise TypeError( + f"attention_mask must be a torch.Tensor, got {type(self.attention_mask)}" + ) + if self.offset_mapping is not None and not isinstance(self.offset_mapping, torch.Tensor): + raise TypeError( + f"offset_mapping must be a torch.Tensor or None, got {type(self.offset_mapping)}" + ) + if self.word_ids is not None and not isinstance(self.word_ids, np.ndarray): + raise TypeError(f"word_ids must be a numpy.ndarray or None, got {type(self.word_ids)}") + + # --- Shape consistency checks ---
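+ # e.g. input_ids of shape (2, 16) requires an attention_mask of shape (2, 16), + # an offset_mapping of shape (2, 16, 2) and word_ids of shape (2, 16)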
+ if self.input_ids.shape != self.attention_mask.shape: + raise ValueError( + f"Shape mismatch: input_ids {self.input_ids.shape} and attention_mask {self.attention_mask.shape}" + ) + + if self.offset_mapping is not None: + expected_shape = (*self.input_ids.shape, 2) + if self.offset_mapping.shape != expected_shape: + raise ValueError( + f"offset_mapping should have shape {expected_shape}, got {self.offset_mapping.shape}" + ) + + if self.word_ids is not None: + if self.word_ids.shape != self.input_ids.shape: + raise ValueError( + f"word_ids should have shape {self.input_ids.shape}, got {self.word_ids.shape}" + ) + + +class BaseTokenizer(ABC): + def __init__( + self, + vocab_size: int, + padding_idx: int, + output_vectorized: bool = False, + output_dim: Optional[int] = None, + ): + """ + Base class for tokenizers. + Args: + vocab_size (int): Size of the vocabulary. + padding_idx (int): Token id used for padding. + output_vectorized (bool): Whether the tokenizer outputs vectorized tokens. + True for instance for a TF-IDF tokenizer. + output_dim (Optional[int]): Fixed output dimension (sequence length or vector size), if any. + """ + + self.vocab_size = vocab_size + self.output_vectorized = output_vectorized + self.output_dim = output_dim + self.padding_idx = padding_idx + if self.output_vectorized: + if output_dim is None: + raise ValueError( + "Tokenizer's output_dim must be provided if output_vectorized is True." + ) + + @abstractmethod + def tokenize(self, text: Union[str, List[str]]) -> TokenizerOutput: + """Tokenizes the raw input text into a TokenizerOutput (token ids and attention mask).""" + pass + + def __len__(self): + return self.vocab_size + + def __repr__(self): + return f"{self.__class__.__name__}(vocab_size={self.vocab_size}, output_vectorized={self.output_vectorized}, output_dim={self.output_dim})" + + def __call__(self, text: Union[str, List[str]], **kwargs) -> TokenizerOutput: + return self.tokenize(text, **kwargs) + + +class HuggingFaceTokenizer(BaseTokenizer): + def __init__( + self, + vocab_size: int, + output_dim: Optional[int] = None, + padding_idx: Optional[int] = None, + trained: bool = False, + ): + super().__init__( + vocab_size, output_vectorized=False, output_dim=output_dim, padding_idx=padding_idx + ) # it outputs token ids and not vectors + + self.trained = trained + self.tokenizer = None + self.padding_idx = padding_idx + self.output_dim = output_dim # constant context size for all batch + + def tokenize( + self, + text: Union[str, List[str]], + return_offsets_mapping: Optional[bool] = False, + return_word_ids: Optional[bool] = False, + ) -> TokenizerOutput: + if not self.trained: + raise RuntimeError("Tokenizer must be trained before tokenization.") + + # Pad to longest sequence if no output_dim is specified + padding = True if self.output_dim is None else "max_length" + truncation = True if self.output_dim is not None else False + + tokenize_output = self.tokenizer( + text, + padding=padding, + return_tensors="pt", + truncation=truncation, + max_length=self.output_dim, + return_offsets_mapping=return_offsets_mapping, + ) # method from PreTrainedTokenizerFast + + encoded_text = tokenize_output["input_ids"] + + if return_word_ids: + word_ids = np.array([tokenize_output.word_ids(i) for i in range(len(encoded_text))]) + else: + word_ids = None + + return TokenizerOutput( + input_ids=encoded_text, + attention_mask=tokenize_output["attention_mask"], + offset_mapping=tokenize_output.get("offset_mapping", None), + word_ids=word_ids, + ) + + @classmethod + def load_from_pretrained(cls, tokenizer_name: str, output_dim: Optional[int] = None): + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + padding_idx = tokenizer.pad_token_id + instance = cls(
vocab_size=len(tokenizer), trained=True, padding_idx=padding_idx, output_dim=output_dim + ) + instance.tokenizer = tokenizer + return instance + + @classmethod + def load(cls, load_path: str): + loaded_tokenizer = PreTrainedTokenizerFast(tokenizer_file=load_path) + instance = cls(vocab_size=len(loaded_tokenizer), trained=True) + instance.tokenizer = loaded_tokenizer + # instance._post_training() + return instance + + @classmethod + def load_from_s3(cls, s3_path: str, filesystem): + if filesystem.exists(s3_path) is False: + raise FileNotFoundError( + f"Tokenizer not found at {s3_path}. Please train it first (see src/train_tokenizers)." + ) + + with filesystem.open(s3_path, "rb") as f: + json_str = f.read().decode("utf-8") + + tokenizer_obj = Tokenizer.from_str(json_str) + tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_obj) + instance = cls(vocab_size=len(tokenizer), trained=True) + instance.tokenizer = tokenizer + instance._post_training() + return instance + + def train(self, *args, **kwargs): + raise NotImplementedError( + "This tokenizer cannot be trained directly. " + "Load it from pretrained or implement train() in a subclass." + ) + + def _post_training(self): + raise NotImplementedError("_post_training() not implemented for HuggingFaceTokenizer.") + + def __repr__(self): + return f"{self.__class__.__name__} \n HuggingFace tokenizer: {self.tokenizer.__repr__()}" diff --git a/torchTextClassifiers/tokenizers/ngram.py b/torchTextClassifiers/tokenizers/ngram.py new file mode 100644 index 0000000..ed0d8cb --- /dev/null +++ b/torchTextClassifiers/tokenizers/ngram.py @@ -0,0 +1,472 @@ +import json +import re +import unicodedata +from functools import lru_cache +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from torchTextClassifiers.tokenizers import BaseTokenizer, TokenizerOutput + +# ============================================================================ +# Optimized normalization +# ============================================================================ + +_fasttext_non_alnum = re.compile(r"[^a-z0-9]+") +_fasttext_multi_space = re.compile(r"\s+") + +# Pre-compile translation table for faster character removal +_COMBINING_MARKS = {c: None for c in range(0x0300, 0x0370)} + + +@lru_cache(maxsize=10000) +def _clean_single_text_cached(text: str) -> str: + """Cached version of text cleaning - major speedup for repeated texts.""" + t = text.lower() + t = unicodedata.normalize("NFKD", t) + # Faster: use translate() instead of list comprehension + t = t.translate(_COMBINING_MARKS) + t = _fasttext_non_alnum.sub(" ", t) + t = _fasttext_multi_space.sub(" ", t) + return t.strip() + + +def clean_text_feature(texts: List[str]) -> List[str]: + """Vectorized text cleaning with caching.""" + return [_clean_single_text_cached(t) for t in texts] + + +# ============================================================================ +# Optimized hash function +# ============================================================================ + + +def fast_hash(s: str) -> int: + """FNV-1a hash - simple and fast.""" + h = 2166136261 + for c in s: + h ^= ord(c) + h = (h * 16777619) & 0xFFFFFFFF + return h + + +# ============================================================================ +# Pre-computed subword cache +# ============================================================================ + + +class SubwordCache: + """Aggressive pre-computation cache for subwords.""" + + def __init__( + self, + word_to_id: dict, + min_n: int, + max_n: int, + 
num_tokens: int, + nwords: int, + unk_token_id: int, + ): + self.cache = {} + self.word_to_id = word_to_id + self.min_n = min_n + self.max_n = max_n + self.num_tokens = num_tokens + self.nwords = nwords + self.unk_token_id = unk_token_id + + # Pre-compute for all vocabulary words + self._precompute_vocab() + + def _precompute_vocab(self): + """Pre-compute subwords for entire vocabulary.""" + for word, word_id in self.word_to_id.items(): + self.cache[word] = self._compute_subwords(word, word_id) + + def _compute_subwords(self, word: str, word_id: Optional[int] = None) -> List[int]: + """Compute subword indices for a word.""" + indices = [] + + # Add word token if in vocab + if word_id is not None: + indices.append(word_id) + + # Extract character n-grams + word_tagged = f"<{word}>" + L = len(word_tagged) + + for n in range(self.min_n, self.max_n + 1): + for i in range(L - n + 1): + ngram = word_tagged[i : i + n] + if ngram != word and ngram != word_tagged: + bucket_idx = fast_hash(ngram) % self.num_tokens + indices.append(3 + self.nwords + bucket_idx) + + return indices if indices else [self.unk_token_id] + + def get(self, word: str) -> List[int]: + """Get subwords with on-demand computation for OOV words.""" + if word not in self.cache: + word_id = self.word_to_id.get(word) + self.cache[word] = self._compute_subwords(word, word_id) + return self.cache[word] + + +# ============================================================================ +# Vectorized encoding with optional metadata +# ============================================================================ + + +def encode_batch_vectorized( + sentences: List[str], + subword_cache: SubwordCache, + eos_token_id: int, + pad_token_id: int, + max_length: Optional[int] = None, + truncation: bool = False, + return_offsets_mapping: bool = False, + return_word_ids: bool = False, + force_max_length: bool = False, +) -> Tuple[torch.Tensor, torch.Tensor, Optional[List], Optional[List]]: + """ + Vectorized batch encoding - processes all sentences together. + Returns padded tensors directly, with optional offset mappings and word IDs. 
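+ All subword tokens of a word share that word's character span and word index; the EOS token gets an empty span and a word id of None.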
+ + Args: + sentences: Input sentences (already cleaned). + subword_cache: Pre-computed SubwordCache mapping words to subword token ids. + eos_token_id: Token id appended at the end of each sentence. + pad_token_id: Token id used for padding. + max_length: Maximum sequence length, if any. + truncation: Whether to truncate sequences longer than max_length. + return_offsets_mapping: If True, also return per-token character spans. + return_word_ids: If True, also return per-token word indices. + force_max_length: If True and max_length is set, always return tensors of size max_length + """ + all_ids = [] + all_offsets = [] if return_offsets_mapping else None + all_word_ids = [] if return_word_ids else None + max_len = 0 + + # First pass: encode all sentences + for sentence in sentences: + ids = [] + offsets = [] if return_offsets_mapping else None + word_ids = [] if return_word_ids else None + + words = sentence.split() + char_offset = 0 + + for word_idx, word in enumerate(words): + # Find the actual position of this word in the original sentence + word_start = sentence.find(word, char_offset) + word_end = word_start + len(word) + char_offset = word_end + + # Get subword tokens for this word + subword_tokens = subword_cache.get(word) + + for token_id in subword_tokens: + ids.append(token_id) + + if return_offsets_mapping: + # All subword tokens of a word map to the word's character span + offsets.append((word_start, word_end)) + + if return_word_ids: + # All subword tokens of a word get the same word_id + word_ids.append(word_idx) + + # Add EOS token + ids.append(eos_token_id) + if return_offsets_mapping: + offsets.append((len(sentence), len(sentence))) # EOS has no span + if return_word_ids: + word_ids.append(None) # EOS is not part of any word + + # Truncate if needed + if truncation and max_length and len(ids) > max_length: + ids = ids[:max_length] + if return_offsets_mapping: + offsets = offsets[:max_length] + if return_word_ids: + word_ids = word_ids[:max_length] + + all_ids.append(ids) + if return_offsets_mapping: + all_offsets.append(offsets) + if return_word_ids: + all_word_ids.append(word_ids) + max_len = max(max_len, len(ids)) + + # Determine final sequence length + if force_max_length and max_length: + # Always use max_length when force_max_length is True + seq_len = max_length + elif max_length and not truncation: + seq_len = min(max_len, max_length) + elif max_length: + seq_len = max_length + else: + seq_len = max_len + + # Pre-allocate tensors + batch_size = len(sentences) + input_ids = torch.full((batch_size, seq_len), pad_token_id, dtype=torch.long) + attention_mask = torch.zeros((batch_size, seq_len), dtype=torch.long) + + # Fill tensors and pad metadata + for i, ids in enumerate(all_ids): + length = min(len(ids), seq_len) + input_ids[i, :length] = torch.tensor(ids[:length], dtype=torch.long) + attention_mask[i, :length] = 1 + + # Pad offsets and word_ids to match sequence length + if return_offsets_mapping: + # Pad with (0, 0) for padding tokens + all_offsets[i] = all_offsets[i][:length] + [(0, 0)] * (seq_len - length) + + if return_word_ids: + # Pad with None for padding tokens + all_word_ids[i] = all_word_ids[i][:length] + [None] * (seq_len - length) + + return input_ids, attention_mask, all_offsets, all_word_ids + + +# ============================================================================ +# NGramTokenizer - Optimized +# ============================================================================ + + +class NGramTokenizer(BaseTokenizer): + """ + Heavily optimized FastText N-gram tokenizer with: + - Pre-computed subword cache for entire vocabulary + - Vectorized batch encoding + - Cached text normalization + - Direct tensor operations + - Optional offset mapping and word ID tracking + """ + + PAD_TOKEN = "[PAD]" + UNK_TOKEN = "[UNK]" + EOS_TOKEN = "</s>" # fastText-style end-of-sentence token + + def __init__( + self, + min_count: int, + min_n: int, + max_n: int, + num_tokens: int, + len_word_ngrams: int, + training_text: Optional[List[str]] = None, + preprocess: bool = True,
output_dim: Optional[int] = None, + **kwargs, + ): + if min_n < 2: + raise ValueError("min_n must be >= 2") + if max_n > 6: + raise ValueError("max_n must be <= 6") + + self.min_count = min_count + self.min_n = min_n + self.max_n = max_n + self.num_tokens = num_tokens + self.word_ngrams = len_word_ngrams + self.preprocess = preprocess + + self.pad_token_id = 0 + self.unk_token_id = 1 + self.eos_token_id = 2 + + if training_text is not None: + self.train(training_text) + else: + self.word_to_id = {} + self.id_to_word = {} + self.nwords = 0 + self.subword_cache = None + + self.vocab_size = 3 + self.nwords + self.num_tokens + + super().__init__( + vocab_size=self.vocab_size, padding_idx=self.pad_token_id, output_dim=output_dim + ) + + def train(self, training_text: List[str]): + """Build vocabulary from training text.""" + word_counts = {} + for sent in training_text: + for w in sent.split(): + word_counts[w] = word_counts.get(w, 0) + 1 + + self.word_to_id = {} + idx = 3 + for w, c in word_counts.items(): + if c >= self.min_count: + self.word_to_id[w] = idx + idx += 1 + + self.nwords = len(self.word_to_id) + self.vocab_size = 3 + self.nwords + self.num_tokens + + # Create reverse mapping + self.id_to_word = {v: k for k, v in self.word_to_id.items()} + self.id_to_word[self.pad_token_id] = self.PAD_TOKEN + self.id_to_word[self.unk_token_id] = self.UNK_TOKEN + self.id_to_word[self.eos_token_id] = self.EOS_TOKEN + + # Pre-compute all subwords for vocabulary + print(f"Pre-computing subwords for {self.nwords} vocabulary words...") + self.subword_cache = SubwordCache( + self.word_to_id, self.min_n, self.max_n, self.num_tokens, self.nwords, self.unk_token_id + ) + print("✓ Subword cache built") + + def tokenize( + self, + text: Union[str, List[str]], + return_offsets_mapping: bool = False, + return_word_ids: bool = False, + **kwargs, + ) -> TokenizerOutput: + """ + Optimized tokenization with vectorized operations. 
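+ + Example (hedged sketch; tiny hyperparameters for illustration, training-time prints omitted): + >>> tok = NGramTokenizer(min_count=1, min_n=3, max_n=6, num_tokens=100, len_word_ngrams=2, training_text=["hello world"]) + >>> out = tok.tokenize(["hello world"]) + >>> out.input_ids.shape[0] + 1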
+ + Args: + text: Single string or list of strings to tokenize + return_offsets_mapping: If True, return character offsets for each token + return_word_ids: If True, return word indices for each token + + Note: + Padding and truncation are derived from self.output_dim: sequences are truncated/padded + to output_dim if it is set, otherwise padded to the longest sequence in the batch. + + Returns: + TokenizerOutput with input_ids, attention_mask, and optionally + offset_mapping and word_ids + """ + is_single = isinstance(text, str) + if is_single: + text = [text] + + # Fast cached text cleaning + if self.preprocess: + text = clean_text_feature(text) + + if self.output_dim is not None: + max_length = self.output_dim + truncation = True + else: + max_length = None + truncation = False + + # Vectorized encoding + input_ids, attention_mask, offsets, word_ids = encode_batch_vectorized( + text, + self.subword_cache, + self.eos_token_id, + self.pad_token_id, + max_length=max_length, + truncation=truncation, + return_offsets_mapping=return_offsets_mapping, + return_word_ids=return_word_ids, + ) + + offsets = torch.tensor(offsets) if return_offsets_mapping else None + word_ids = np.array(word_ids) if return_word_ids else None + + return TokenizerOutput( + input_ids=input_ids, + attention_mask=attention_mask, + word_ids=word_ids, + offset_mapping=offsets, + ) + + def decode( + self, token_ids: Union[List[int], torch.Tensor], skip_special_tokens: bool = True + ) -> str: + """Decode token IDs back to text.""" + if isinstance(token_ids, torch.Tensor): + token_ids = token_ids.tolist() + + tokens = [] + for id_ in token_ids: + if id_ == self.pad_token_id and skip_special_tokens: + continue + + if id_ == self.eos_token_id: + if not skip_special_tokens: + tokens.append(self.EOS_TOKEN) + continue + + if id_ in self.id_to_word: + tokens.append(self.id_to_word[id_]) + elif not skip_special_tokens: + tokens.append(f"[ID:{id_}]") + + return " ".join(tokens) + + def batch_decode( + self, sequences: Union[List[List[int]], torch.Tensor], skip_special_tokens: bool = True + ) -> List[str]: + """Decode multiple sequences.""" + if isinstance(sequences, torch.Tensor): + sequences = sequences.tolist() + return [self.decode(seq, skip_special_tokens) for seq in sequences] + + def save_pretrained(self, save_directory: str): + """Save tokenizer configuration and vocabulary.""" + import os + + os.makedirs(save_directory, exist_ok=True) + + config = { + "min_count": self.min_count, + "min_n": self.min_n, + "max_n": self.max_n, + "num_tokens": self.num_tokens, + "len_word_ngrams": self.word_ngrams, + "word_to_id": self.word_to_id, + "preprocess": self.preprocess, + "vocab_size": self.vocab_size, + "nwords": self.nwords, + } + + with open(f"{save_directory}/tokenizer.json", "w") as f: + json.dump(config, f, indent=2) + + print(f"✓ Tokenizer saved to {save_directory}") + + @classmethod + def from_pretrained(cls, directory: str): + """Load tokenizer from saved configuration.""" + with open(f"{directory}/tokenizer.json", "r") as f: + config = json.load(f) + + tokenizer = cls( + min_count=config["min_count"], + min_n=config["min_n"], + max_n=config["max_n"], + num_tokens=config["num_tokens"], + len_word_ngrams=config["len_word_ngrams"], + preprocess=config["preprocess"], + training_text=None, + ) + + tokenizer.word_to_id = config["word_to_id"] + tokenizer.nwords = config["nwords"] + tokenizer.vocab_size = config["vocab_size"] + + tokenizer.id_to_word = {v: k for k, v in tokenizer.word_to_id.items()} + tokenizer.id_to_word[tokenizer.pad_token_id] =
cls.PAD_TOKEN + tokenizer.id_to_word[tokenizer.unk_token_id] = cls.UNK_TOKEN + tokenizer.id_to_word[tokenizer.eos_token_id] = cls.EOS_TOKEN + + # Rebuild subword cache + print("Rebuilding subword cache...") + tokenizer.subword_cache = SubwordCache( + tokenizer.word_to_id, + tokenizer.min_n, + tokenizer.max_n, + tokenizer.num_tokens, + tokenizer.nwords, + tokenizer.unk_token_id, + ) + print("✓ Subword cache built") + + print(f"✓ Tokenizer loaded from {directory}") + return tokenizer diff --git a/torchTextClassifiers/torchTextClassifiers.py b/torchTextClassifiers/torchTextClassifiers.py index 1629f4a..66b285d 100644 --- a/torchTextClassifiers/torchTextClassifiers.py +++ b/torchTextClassifiers/torchTextClassifiers.py @@ -1,7 +1,15 @@ import logging import time -import json -from typing import Optional, Union, Type, List, Dict, Any +from dataclasses import asdict, dataclass, field +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +try: + from captum.attr import LayerIntegratedGradients + + HAS_CAPTUM = True +except ImportError: + HAS_CAPTUM = False + import numpy as np import pytorch_lightning as pl @@ -12,9 +20,17 @@ ModelCheckpoint, ) -from .utilities.checkers import check_X, check_Y, NumpyJSONEncoder -from .classifiers.base import BaseClassifierConfig, BaseClassifierWrapper - +from torchTextClassifiers.dataset import TextClassificationDataset +from torchTextClassifiers.model import TextClassificationModel, TextClassificationModule +from torchTextClassifiers.model.components import ( + AttentionConfig, + CategoricalForwardType, + CategoricalVariableNet, + ClassificationHead, + TextEmbedder, + TextEmbedderConfig, +) +from torchTextClassifiers.tokenizers import BaseTokenizer, TokenizerOutput logger = logging.getLogger(__name__) @@ -26,296 +42,256 @@ ) +@dataclass +class ModelConfig: + """Base configuration class for text classifiers.""" + + embedding_dim: int + categorical_vocabulary_sizes: Optional[List[int]] = None + categorical_embedding_dims: Optional[Union[List[int], int]] = None + num_classes: Optional[int] = None + attention_config: Optional[AttentionConfig] = None + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "ModelConfig": + return cls(**data) + + +@dataclass +class TrainingConfig: + num_epochs: int + batch_size: int + lr: float + loss: torch.nn.Module = field(default_factory=lambda: torch.nn.CrossEntropyLoss()) + optimizer: Type[torch.optim.Optimizer] = torch.optim.Adam + scheduler: Optional[Type[torch.optim.lr_scheduler._LRScheduler]] = None + accelerator: str = "auto" + num_workers: int = 12 + patience_early_stopping: int = 3 + dataloader_params: Optional[dict] = None + trainer_params: Optional[dict] = None + optimizer_params: Optional[dict] = None + scheduler_params: Optional[dict] = None + + def to_dict(self) -> Dict[str, Any]: + data = asdict(self) + # Serialize loss and scheduler as their class names + data["loss"] = self.loss.__class__.__name__ + if self.scheduler is not None: + data["scheduler"] = self.scheduler.__name__ + return data class torchTextClassifiers: """Generic text classifier framework supporting multiple architectures. - - This is the main class that provides a unified interface for different types - of text classifiers. It acts as a high-level wrapper that delegates operations - to specific classifier implementations while providing a consistent API. 
- - The class supports the full machine learning workflow including: - - Building tokenizers from training data - - Model training with validation - - Prediction and evaluation - - Model serialization and loading - - Attributes: - config: Configuration object specific to the classifier type - classifier: The underlying classifier implementation - - Example: - >>> from torchTextClassifiers import torchTextClassifiers - >>> from torchTextClassifiers.classifiers.fasttext.config import FastTextConfig - >>> from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper - >>> - >>> # Create configuration - >>> config = FastTextConfig( - ... embedding_dim=100, - ... num_tokens=10000, - ... min_count=1, - ... min_n=3, - ... max_n=6, - ... len_word_ngrams=2, - ... num_classes=2 - ... ) - >>> - >>> # Initialize classifier with wrapper - >>> wrapper = FastTextWrapper(config) - >>> classifier = torchTextClassifiers(wrapper) - >>> - >>> # Build and train - >>> classifier.build(X_train, y_train) - >>> classifier.train(X_train, y_train, X_val, y_val, num_epochs=10, batch_size=32) - >>> - >>> # Predict - >>> predictions = classifier.predict(X_test) + + Given a tokenizer and model configuration, this class initializes: + - Text embedding layer (if needed) + - Categorical variable embedding network (if categorical variables are provided) + - Classification head + The resulting model can be trained using PyTorch Lightning and used for predictions. + """ - - def __init__(self, classifier: BaseClassifierWrapper): - """Initialize the torchTextClassifiers instance. - - Args: - classifier: An instance of a classifier wrapper that implements BaseClassifierWrapper - - Example: - >>> from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper - >>> from torchTextClassifiers.classifiers.fasttext.config import FastTextConfig - >>> config = FastTextConfig(embedding_dim=50, num_tokens=5000) - >>> wrapper = FastTextWrapper(config) - >>> classifier = torchTextClassifiers(wrapper) - """ - self.classifier = classifier - self.config = classifier.config - - - def build_tokenizer(self, training_text: np.ndarray) -> None: - """Build tokenizer from training text data. - - This method is kept for backward compatibility. It delegates to - prepare_text_features which handles the actual text preprocessing. - - Args: - training_text: Array of text strings to build the tokenizer from - - Example: - >>> import numpy as np - >>> texts = np.array(["Hello world", "This is a test", "Another example"]) - >>> classifier.build_tokenizer(texts) - """ - self.classifier.prepare_text_features(training_text) - - def prepare_text_features(self, training_text: np.ndarray) -> None: - """Prepare text features for the classifier. - - This method handles text preprocessing which could involve tokenization, - vectorization, or other approaches depending on the classifier type. - - Args: - training_text: Array of text strings to prepare features from - - Example: - >>> import numpy as np - >>> texts = np.array(["Hello world", "This is a test", "Another example"]) - >>> classifier.prepare_text_features(texts) - """ - self.classifier.prepare_text_features(training_text) - - def build( + + def __init__( self, - X_train: np.ndarray, - y_train: np.ndarray = None, - lightning=True, - **kwargs - ) -> None: - """Build the complete classifier from training data. 
- - This method handles the full model building process including: - - Input validation and preprocessing - - Tokenizer creation from training text - - Model architecture initialization - - Lightning module setup (if enabled) - + tokenizer: BaseTokenizer, + model_config: ModelConfig, + ): + """Initialize the torchTextClassifiers instance. + Args: - X_train: Training input data (text and optional categorical features) - y_train: Training labels (optional, can be inferred if num_classes is set) - lightning: Whether to initialize PyTorch Lightning components - **kwargs: Additional arguments passed to Lightning initialization - - Raises: - ValueError: If y_train is None and num_classes is not set in config - ValueError: If label values are outside expected range - + tokenizer: A tokenizer instance for text preprocessing + model_config: Configuration parameters for the text classification model + Example: - >>> X_train = np.array(["text sample 1", "text sample 2"]) - >>> y_train = np.array([0, 1]) - >>> classifier.build(X_train, y_train) + >>> from torchTextClassifiers import ModelConfig, TrainingConfig, torchTextClassifiers + >>> # Assume tokenizer is a trained BaseTokenizer instance + >>> model_config = ModelConfig( + ... embedding_dim=10, + ... categorical_vocabulary_sizes=[30, 25], + ... categorical_embedding_dims=[10, 5], + ... num_classes=10, + ... ) + >>> ttc = torchTextClassifiers( + ... tokenizer=tokenizer, + ... model_config=model_config, + ... ) """ - training_text, categorical_variables, no_cat_var = check_X(X_train) - - if y_train is not None: - if self.config.num_classes is not None: - if self.config.num_classes != len(np.unique(y_train)): - logger.warning( - f"Updating num_classes from {self.config.num_classes} to {len(np.unique(y_train))}" - ) - - y_train = check_Y(y_train) - self.config.num_classes = len(np.unique(y_train)) - - if np.max(y_train) >= self.config.num_classes: - raise ValueError( - "y_train must contain values between 0 and num_classes-1" + + self.model_config = model_config + self.tokenizer = tokenizer + + if hasattr(self.tokenizer, "trained"): + if not self.tokenizer.trained: + raise RuntimeError( + f"Tokenizer {type(self.tokenizer)} must be trained before initializing the classifier." ) + + self.vocab_size = tokenizer.vocab_size + self.embedding_dim = model_config.embedding_dim + self.categorical_vocabulary_sizes = model_config.categorical_vocabulary_sizes + self.num_classes = model_config.num_classes + + if self.tokenizer.output_vectorized: + self.text_embedder = None + logger.info( + "Tokenizer outputs vectorized tokens; skipping TextEmbedder initialization." + ) + self.embedding_dim = self.tokenizer.output_dim else: - if self.config.num_classes is None: - raise ValueError( - "Either num_classes must be provided at init or y_train must be provided here." 
- ) - - # Handle categorical variables - if not no_cat_var: - if hasattr(self.config, 'num_categorical_features') and self.config.num_categorical_features is not None: - if self.config.num_categorical_features != categorical_variables.shape[1]: - logger.warning( - f"Updating num_categorical_features from {self.config.num_categorical_features} to {categorical_variables.shape[1]}" - ) - - if hasattr(self.config, 'num_categorical_features'): - self.config.num_categorical_features = categorical_variables.shape[1] - - categorical_vocabulary_sizes = np.max(categorical_variables, axis=0) + 1 - - if hasattr(self.config, 'categorical_vocabulary_sizes') and self.config.categorical_vocabulary_sizes is not None: - if self.config.categorical_vocabulary_sizes != list(categorical_vocabulary_sizes): - logger.warning( - "Overwriting categorical_vocabulary_sizes with values from training data." - ) - if hasattr(self.config, 'categorical_vocabulary_sizes'): - self.config.categorical_vocabulary_sizes = list(categorical_vocabulary_sizes) - - self.classifier.prepare_text_features(training_text) - self.classifier._build_pytorch_model() - - if lightning: - self.classifier._check_and_init_lightning(**kwargs) - + text_embedder_config = TextEmbedderConfig( + vocab_size=self.vocab_size, + embedding_dim=self.embedding_dim, + padding_idx=tokenizer.padding_idx, + attention_config=model_config.attention_config, + ) + self.text_embedder = TextEmbedder( + text_embedder_config=text_embedder_config, + ) + + classif_head_input_dim = self.embedding_dim + if self.categorical_vocabulary_sizes: + self.categorical_var_net = CategoricalVariableNet( + categorical_vocabulary_sizes=self.categorical_vocabulary_sizes, + categorical_embedding_dims=model_config.categorical_embedding_dims, + text_embedding_dim=self.embedding_dim, + ) + + if self.categorical_var_net.forward_type != CategoricalForwardType.SUM_TO_TEXT: + classif_head_input_dim += self.categorical_var_net.output_dim + + else: + self.categorical_var_net = None + + self.classification_head = ClassificationHead( + input_dim=classif_head_input_dim, + num_classes=model_config.num_classes, + ) + + self.pytorch_model = TextClassificationModel( + text_embedder=self.text_embedder, + categorical_variable_net=self.categorical_var_net, + classification_head=self.classification_head, + ) + def train( self, X_train: np.ndarray, y_train: np.ndarray, X_val: np.ndarray, y_val: np.ndarray, - num_epochs: int, - batch_size: int, - cpu_run: bool = False, - num_workers: int = 12, - patience_train: int = 3, + training_config: TrainingConfig, verbose: bool = False, - trainer_params: Optional[dict] = None, - **kwargs ) -> None: """Train the classifier using PyTorch Lightning. 
- + This method handles the complete training process including: - Data validation and preprocessing - Dataset and DataLoader creation - PyTorch Lightning trainer setup with callbacks - Model training with early stopping - Best model loading after training - + Args: X_train: Training input data y_train: Training labels X_val: Validation input data y_val: Validation labels - num_epochs: Maximum number of training epochs - batch_size: Batch size for training and validation - cpu_run: If True, force training on CPU instead of GPU - num_workers: Number of worker processes for data loading - patience_train: Number of epochs to wait for improvement before early stopping - verbose: If True, print detailed training progress - trainer_params: Additional parameters to pass to PyTorch Lightning Trainer - **kwargs: Additional arguments passed to the build method - + training_config: Configuration parameters for training + verbose: Whether to print training progress information + + Example: - >>> classifier.train( - ... X_train, y_train, X_val, y_val, - ... num_epochs=50, - ... batch_size=32, - ... patience_train=5, - ... verbose=True - ... ) + + >>> training_config = TrainingConfig( + ... lr=1e-3, + ... batch_size=4, + ... num_epochs=1, + ... ) + >>> ttc.train( + ... X_train=X, + ... y_train=Y, + ... X_val=X, + ... y_val=Y, + ... training_config=training_config, + ... ) """ # Input validation - training_text, train_categorical_variables, train_no_cat_var = check_X(X_train) - val_text, val_categorical_variables, val_no_cat_var = check_X(X_val) - y_train = check_Y(y_train) - y_val = check_Y(y_val) - - # Consistency checks - assert train_no_cat_var == val_no_cat_var, ( - "X_train and X_val must have the same number of categorical variables." - ) - assert X_train.shape[0] == y_train.shape[0], ( - "X_train and y_train must have the same number of observations." - ) - assert X_train.ndim > 1 and X_train.shape[1] == X_val.shape[1] or X_val.ndim == 1, ( - "X_train and X_val must have the same number of columns." - ) - + X_train, y_train = self._check_XY(X_train, y_train) + X_val, y_val = self._check_XY(X_val, y_val) + + if ( + X_train["categorical_variables"] is not None + and X_val["categorical_variables"] is not None + ): + assert ( + X_train["categorical_variables"].ndim > 1 + and X_train["categorical_variables"].shape[1] + == X_val["categorical_variables"].shape[1] + or X_val["categorical_variables"].ndim == 1 + ), "X_train and X_val must have the same number of columns." 
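+            # `and` binds tighter than `or` in the condition above: the check passes
+            # when both categorical blocks have the same number of columns, or when
+            # X_val is one-dimensional.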
+ if verbose: logger.info("Starting training process...") - - # Device setup - if cpu_run: - device = torch.device("cpu") - else: + + if training_config.accelerator == "auto": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - self.classifier.device = device - + else: + device = torch.device(training_config.accelerator) + + self.device = device + + optimizer_params = {"lr": training_config.lr} + if training_config.optimizer_params is not None: + optimizer_params.update(training_config.optimizer_params) + + self.lightning_module = TextClassificationModule( + model=self.pytorch_model, + loss=training_config.loss, + optimizer=training_config.optimizer, + optimizer_params=optimizer_params, + scheduler=training_config.scheduler, + scheduler_params=training_config.scheduler_params + if training_config.scheduler_params + else {}, + scheduler_interval="epoch", + ) + + self.pytorch_model.to(self.device) + if verbose: logger.info(f"Running on: {device}") - - # Build model if not already built - if self.classifier.pytorch_model is None: - if verbose: - start = time.time() - logger.info("Building the model...") - self.build(X_train, y_train, **kwargs) - if verbose: - end = time.time() - logger.info(f"Model built in {end - start:.2f} seconds.") - - self.classifier.pytorch_model = self.classifier.pytorch_model.to(device) - - # Create datasets and dataloaders using wrapper methods - train_dataset = self.classifier.create_dataset( - texts=training_text, + + train_dataset = TextClassificationDataset( + texts=X_train["text"], + categorical_variables=X_train["categorical_variables"], # None if no cat vars + tokenizer=self.tokenizer, labels=y_train, - categorical_variables=train_categorical_variables, ) - val_dataset = self.classifier.create_dataset( - texts=val_text, + val_dataset = TextClassificationDataset( + texts=X_val["text"], + categorical_variables=X_val["categorical_variables"], # None if no cat vars + tokenizer=self.tokenizer, labels=y_val, - categorical_variables=val_categorical_variables, ) - - train_dataloader = self.classifier.create_dataloader( - dataset=train_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=True + + train_dataloader = train_dataset.create_dataloader( + batch_size=training_config.batch_size, + num_workers=training_config.num_workers, + shuffle=True, + **training_config.dataloader_params if training_config.dataloader_params else {}, ) - val_dataloader = self.classifier.create_dataloader( - dataset=val_dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=False + val_dataloader = val_dataset.create_dataloader( + batch_size=training_config.batch_size, + num_workers=training_config.num_workers, + shuffle=False, + **training_config.dataloader_params if training_config.dataloader_params else {}, ) - + # Setup trainer callbacks = [ ModelCheckpoint( @@ -326,184 +302,266 @@ def train( ), EarlyStopping( monitor="val_loss", - patience=patience_train, + patience=training_config.patience_early_stopping, mode="min", ), LearningRateMonitor(logging_interval="step"), ] - - train_params = { + + trainer_params = { + "accelerator": training_config.accelerator, "callbacks": callbacks, - "max_epochs": num_epochs, + "max_epochs": training_config.num_epochs, "num_sanity_val_steps": 2, "strategy": "auto", "log_every_n_steps": 1, "enable_progress_bar": True, } - - if trainer_params is not None: - train_params.update(trainer_params) - - trainer = pl.Trainer(**train_params) - + + if training_config.trainer_params is not None: + 
trainer_params.update(training_config.trainer_params)
+
+        trainer = pl.Trainer(**trainer_params)
+
         torch.cuda.empty_cache()
         torch.set_float32_matmul_precision("medium")
-
+
         if verbose:
             logger.info("Launching training...")
             start = time.time()
-
-        trainer.fit(self.classifier.lightning_module, train_dataloader, val_dataloader)
-
+
+        trainer.fit(self.lightning_module, train_dataloader, val_dataloader)
+
         if verbose:
             end = time.time()
             logger.info(f"Training completed in {end - start:.2f} seconds.")
-
-        # Load best model using wrapper method
+
         best_model_path = trainer.checkpoint_callback.best_model_path
-        self.classifier.load_best_model(best_model_path)
-
-    def predict(self, X: np.ndarray, **kwargs) -> np.ndarray:
-        """Make predictions on input data.
-
-        Args:
-            X: Input data for prediction (text and optional categorical features)
-            **kwargs: Additional arguments passed to the underlying predictor
-
-        Returns:
-            np.ndarray: Predicted class labels
-
-        Example:
-            >>> X_test = np.array(["new text sample", "another sample"])
-            >>> predictions = classifier.predict(X_test)
-            >>> print(predictions)  # [0, 1]
-        """
-        return self.classifier.predict(X, **kwargs)
-
-    def validate(self, X: np.ndarray, Y: np.ndarray, **kwargs) -> float:
-        """Validate the model on test data.
-
-        Args:
-            X: Input data for validation
-            Y: True labels for validation
-            **kwargs: Additional arguments passed to the validator
-
-        Returns:
-            float: Validation accuracy score
-
-        Example:
-            >>> accuracy = classifier.validate(X_test, y_test)
-            >>> print(f"Accuracy: {accuracy:.3f}")
-        """
-        return self.classifier.validate(X, Y, **kwargs)
-
-    def predict_and_explain(self, X: np.ndarray, **kwargs):
-        """Make predictions with explanations (if supported).
-
-        This method provides both predictions and explanations for the model's
-        decisions. Availability depends on the specific classifier implementation.
-
+
+        self.lightning_module = TextClassificationModule.load_from_checkpoint(
+            best_model_path,
+            model=self.pytorch_model,
+            loss=training_config.loss,
+        )
+
+        self.pytorch_model = self.lightning_module.model.to(self.device)
+
+        self.lightning_module.eval()
+
+    def _check_XY(self, X: np.ndarray, Y: np.ndarray) -> Tuple[dict, np.ndarray]:
+        X = self._check_X(X)
+        Y = self._check_Y(Y)
+
+        if X["text"].shape[0] != Y.shape[0]:
+            raise ValueError("X and Y must have the same number of observations.")
+
+        return X, Y
+
+    @staticmethod
+    def _check_text_col(X):
+        assert isinstance(
+            X, np.ndarray
+        ), "X must be a numpy array of shape (N,d), with the first column being the text and the rest being the categorical variables."
+
+        try:
+            if X.ndim > 1:
+                text = X[:, 0].astype(str)
+            else:
+                text = X[:].astype(str)
+        except ValueError:
+            logger.error("The first column of X must be castable in string format.")
+            raise  # re-raise: `text` would otherwise be unbound below
+
+        return text
+
+    def _check_categorical_variables(self, X: np.ndarray) -> np.ndarray:
+        """Check if categorical variables in X match training configuration.
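+
+        Casts the categorical columns to integers and validates each one against the
+        vocabulary size given in the model configuration, returning the cast array.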
+
         Args:
-            X: Input data for prediction
-            **kwargs: Additional arguments passed to the explainer
-
-        Returns:
-            tuple: (predictions, explanations) where explanations format depends
-                on the classifier type
-
+            X: Input data to check
+
         Raises:
-            NotImplementedError: If the classifier doesn't support explanations
-
-        Example:
-            >>> predictions, explanations = classifier.predict_and_explain(X_test)
-            >>> print(f"Predictions: {predictions}")
-            >>> print(f"Explanations: {explanations}")
+            ValueError: If the number of categorical variables does not match
+                the training configuration
         """
-        if hasattr(self.classifier, 'predict_and_explain'):
-            return self.classifier.predict_and_explain(X, **kwargs)
+
+        assert self.categorical_var_net is not None
+
+        if X.ndim > 1:
+            num_cat_vars = X.shape[1] - 1
         else:
-            raise NotImplementedError(f"Explanation not supported for {type(self.classifier).__name__}")
-
-    def to_json(self, filepath: str) -> None:
-        """Save classifier configuration to JSON file.
-
-        This method serializes the classifier configuration to a JSON
-        file. Note: This only saves configuration, not trained model weights.
-        Custom classifier wrappers should implement a class method `get_wrapper_class_info()`
-        that returns a dict with 'module' and 'class_name' keys for proper reconstruction.
-
-        Args:
-            filepath: Path where to save the JSON configuration file
-
-        Example:
-            >>> classifier.to_json('my_classifier_config.json')
+            num_cat_vars = 0
+
+        if num_cat_vars != self.categorical_var_net.num_categorical_features:
+            raise ValueError(
+                f"X must have the same number of categorical variables as the number of embedding layers in the categorical net: ({self.categorical_var_net.num_categorical_features})."
+            )
+
+        try:
+            categorical_variables = X[:, 1:].astype(int)
+        except ValueError:
+            logger.error(
+                f"Columns 1 to {X.shape[1] - 1} of X must be castable in integer format."
+            )
+            raise  # re-raise: `categorical_variables` would otherwise be unbound below
+
+        for j in range(X.shape[1] - 1):
+            max_cat_value = categorical_variables[:, j].max()
+            if max_cat_value >= self.categorical_var_net.categorical_vocabulary_sizes[j]:
+                raise ValueError(
+                    f"Categorical variable at index {j} has value {max_cat_value} which exceeds the vocabulary size of {self.categorical_var_net.categorical_vocabulary_sizes[j]}."
+                )
+
+        return categorical_variables
+
+    def _check_X(self, X: np.ndarray) -> dict:
+        text = self._check_text_col(X)
+
+        categorical_variables = None
+        if self.categorical_var_net is not None:
+            categorical_variables = self._check_categorical_variables(X)
+
+        return {"text": text, "categorical_variables": categorical_variables}
+
+    def _check_Y(self, Y):
+        assert isinstance(Y, np.ndarray), "Y must be a numpy array of shape (N,) or (N,1)."
+        assert len(Y.shape) == 1 or (
+            len(Y.shape) == 2 and Y.shape[1] == 1
+        ), "Y must be a numpy array of shape (N,) or (N,1)."
+
+        try:
+            Y = Y.astype(int)
+        except ValueError:
+            logger.error("Y must be castable in integer format.")
+            raise
+
+        if Y.max() >= self.num_classes or Y.min() < 0:
+            raise ValueError(
+                f"Y contains class labels outside the range [0, {self.num_classes - 1}]."
+            )
+
+        return Y
+
+    def predict(
+        self,
+        X_test: np.ndarray,
+        top_k=1,
+        explain=False,
+    ):
         """
-        with open(filepath, "w") as f:
-            data = {
-                "config": self.config.to_dict(),
-            }
-
-            # Try to get wrapper class info for reconstruction
-            if hasattr(self.classifier.__class__, 'get_wrapper_class_info'):
-                data["wrapper_class_info"] = self.classifier.__class__.get_wrapper_class_info()
-            else:
-                # Fallback: store module and class name
-                data["wrapper_class_info"] = {
-                    "module": self.classifier.__class__.__module__,
-                    "class_name": self.classifier.__class__.__name__
-                }
-
-            json.dump(data, f, cls=NumpyJSONEncoder, indent=4)
-
-    @classmethod
-    def from_json(cls, filepath: str, wrapper_class: Optional[Type[BaseClassifierWrapper]] = None) -> "torchTextClassifiers":
-        """Load classifier configuration from JSON file.
-
-        This method creates a new classifier instance from a previously saved
-        configuration file. The classifier will need to be built and trained again.

-        Args:
-            filepath: Path to the JSON configuration file
-            wrapper_class: Optional wrapper class to use. If not provided, will try to
-                reconstruct from saved wrapper_class_info
-
-        Returns:
-            torchTextClassifiers: New classifier instance with loaded configuration
-
-        Raises:
-            ImportError: If the wrapper class cannot be imported
-            FileNotFoundError: If the configuration file doesn't exist
-
-        Example:
-            >>> # Using saved wrapper class info
-            >>> classifier = torchTextClassifiers.from_json('my_classifier_config.json')
-            >>>
-            >>> # Or providing wrapper class explicitly
-            >>> from torchTextClassifiers.classifiers.fasttext.wrapper import FastTextWrapper
-            >>> classifier = torchTextClassifiers.from_json('config.json', FastTextWrapper)
+        Predict the most likely classes for new samples, optionally with explanations.
+
+        Args:
+            X_test (np.ndarray): input data to predict on, shape (N,d) where the first column is text and the rest are categorical variables
+            top_k (int): for each sentence, return the top_k most likely predictions (default: 1)
+            explain (bool): launch gradient integration to get an explanation of the prediction (default: False)
+
+        Returns: A dictionary containing the following fields:
+            - prediction (torch.Tensor, shape (len(text), top_k)): A tensor containing the top_k most likely class labels for each sample.
+            - confidence (torch.Tensor, shape (len(text), top_k)): A tensor containing the corresponding confidence scores (softmax probabilities).
+            - if explain is True:
+                - attributions (torch.Tensor, shape (len(text), top_k, seq_len)): A tensor containing the attributions for each token in the text.
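+
+        Example (illustrative only; assumes a trained instance `ttc` whose model was
+        configured with a single categorical variable):
+            >>> X_test = np.array([["some text", 3], ["other text", 7]], dtype=object)
+            >>> out = ttc.predict(X_test, top_k=2)
+            >>> out["prediction"].shape  # torch.Size([2, 2]): top-2 classes per sample
+            >>> out["confidence"]  # matching probabilities, rounded to 2 decimals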
""" - with open(filepath, "r") as f: - data = json.load(f) - - if wrapper_class is None: - # Try to reconstruct wrapper class from saved info - if "wrapper_class_info" not in data: - raise ValueError("No wrapper_class_info found in config file and no wrapper_class provided") - - wrapper_info = data["wrapper_class_info"] - module_name = wrapper_info["module"] - class_name = wrapper_info["class_name"] - - # Dynamically import the wrapper class - import importlib - module = importlib.import_module(module_name) - wrapper_class = getattr(module, class_name) - - # Reconstruct config using wrapper class's config class - config_class = wrapper_class.get_config_class() - config = config_class.from_dict(data["config"]) - - # Create wrapper instance - wrapper = wrapper_class(config) - - return cls(wrapper) \ No newline at end of file + + if explain: + return_offsets_mapping = True # to be passed to the tokenizer + return_word_ids = True + if self.pytorch_model.text_embedder is None: + raise RuntimeError( + "Explainability is not supported when the tokenizer outputs vectorized text directly. Please use a tokenizer that outputs token IDs." + ) + else: + if not HAS_CAPTUM: + raise ImportError( + "Captum is not installed and is required for explainability. Run 'pip install/uv add torchFastText[explainability]'." + ) + lig = LayerIntegratedGradients( + self.pytorch_model, self.pytorch_model.text_embedder.embedding_layer + ) # initialize a Captum layer gradient integrator + else: + return_offsets_mapping = False + return_word_ids = False + + X_test = self._check_X(X_test) + text = X_test["text"] + categorical_variables = X_test["categorical_variables"] + + self.pytorch_model.eval().cpu() + + tokenize_output = self.tokenizer.tokenize( + text.tolist(), + return_offsets_mapping=return_offsets_mapping, + return_word_ids=return_word_ids, + ) + + if not isinstance(tokenize_output, TokenizerOutput): + raise TypeError( + f"Expected TokenizerOutput, got {type(tokenize_output)} from tokenizer.tokenize method." 
+ ) + + encoded_text = tokenize_output.input_ids # (batch_size, seq_len) + attention_mask = tokenize_output.attention_mask # (batch_size, seq_len) + + if categorical_variables is not None: + categorical_vars = torch.tensor( + categorical_variables, dtype=torch.float32 + ) # (batch_size, num_categorical_features) + else: + categorical_vars = torch.empty((encoded_text.shape[0], 0), dtype=torch.float32) + + pred = self.pytorch_model( + encoded_text, attention_mask, categorical_vars + ) # forward pass, contains the prediction scores (len(text), num_classes) + + label_scores = pred.detach().cpu().softmax(dim=1) # convert to probabilities + + label_scores_topk = torch.topk(label_scores, k=top_k, dim=1) + + predictions = label_scores_topk.indices # get the top_k most likely predictions + confidence = torch.round(label_scores_topk.values, decimals=2) # and their scores + + if explain: + all_attributions = [] + for k in range(top_k): + attributions = lig.attribute( + (encoded_text, attention_mask, categorical_vars), + target=torch.Tensor(predictions[:, k]).long(), + ) # (batch_size, seq_len) + attributions = attributions.sum(dim=-1) + all_attributions.append(attributions.detach().cpu()) + + all_attributions = torch.stack(all_attributions, dim=1) # (batch_size, top_k, seq_len) + + return { + "prediction": predictions, + "confidence": confidence, + "attributions": all_attributions, + "offset_mapping": tokenize_output.offset_mapping, + "word_ids": tokenize_output.word_ids, + } + else: + return { + "prediction": predictions, + "confidence": confidence, + } + + def __repr__(self): + model_type = ( + self.lightning_module.__repr__() + if hasattr(self, "lightning_module") + else self.pytorch_model.__repr__() + ) + + tokenizer_info = self.tokenizer.__repr__() + + cat_forward_type = ( + self.categorical_var_net.forward_type.name + if self.categorical_var_net is not None + else "None" + ) + + lines = [ + "torchTextClassifiers(", + f" tokenizer = {tokenizer_info},", + f" model = {model_type},", + f" categorical_forward_type = {cat_forward_type},", + f" num_classes = {self.model_config.num_classes},", + f" embedding_dim = {self.embedding_dim},", + ")", + ] + return "\n".join(lines) diff --git a/torchTextClassifiers/utilities/__init__.py b/torchTextClassifiers/utilities/__init__.py index 84d1550..e69de29 100644 --- a/torchTextClassifiers/utilities/__init__.py +++ b/torchTextClassifiers/utilities/__init__.py @@ -1,3 +0,0 @@ -""" -Init script. -""" diff --git a/torchTextClassifiers/utilities/checkers.py b/torchTextClassifiers/utilities/checkers.py deleted file mode 100644 index 1fef3fc..0000000 --- a/torchTextClassifiers/utilities/checkers.py +++ /dev/null @@ -1,108 +0,0 @@ -import logging -import json -from typing import Optional, Union, Type, List - -import numpy as np - -logger = logging.getLogger(__name__) - - -def check_X(X): - assert isinstance(X, np.ndarray), ( - "X must be a numpy array of shape (N,d), with the first column being the text and the rest being the categorical variables." - ) - - try: - if X.ndim > 1: - text = X[:, 0].astype(str) - else: - text = X[:].astype(str) - except ValueError: - logger.error("The first column of X must be castable in string format.") - - if len(X.shape) == 1 or (len(X.shape) == 2 and X.shape[1] == 1): - no_cat_var = True - else: - no_cat_var = False - - if not no_cat_var: - try: - categorical_variables = X[:, 1:].astype(int) - except ValueError: - logger.error( - f"Columns {1} to {X.shape[1] - 1} of X_train must be castable in integer format." 
- ) - else: - categorical_variables = None - - return text, categorical_variables, no_cat_var - - -def check_Y(Y): - assert isinstance(Y, np.ndarray), "Y must be a numpy array of shape (N,) or (N,1)." - assert len(Y.shape) == 1 or (len(Y.shape) == 2 and Y.shape[1] == 1), ( - "Y must be a numpy array of shape (N,) or (N,1)." - ) - - try: - Y = Y.astype(int) - except ValueError: - logger.error("Y must be castable in integer format.") - - return Y - - -def validate_categorical_inputs( - categorical_vocabulary_sizes: List[int], - categorical_embedding_dims: Union[List[int], int], - num_categorical_features: int = None, -): - if categorical_vocabulary_sizes is None: - logger.warning("No categorical_vocabulary_sizes. It will be inferred later.") - return None, None, None - - else: - if not isinstance(categorical_vocabulary_sizes, list): - raise TypeError("categorical_vocabulary_sizes must be a list of int") - - if isinstance(categorical_embedding_dims, list): - if len(categorical_vocabulary_sizes) != len(categorical_embedding_dims): - raise ValueError( - "Categorical vocabulary sizes and their embedding dimensions must have the same length" - ) - - if num_categorical_features is not None: - if len(categorical_vocabulary_sizes) != num_categorical_features: - raise ValueError( - "len(categorical_vocabulary_sizes) must be equal to num_categorical_features" - ) - else: - num_categorical_features = len(categorical_vocabulary_sizes) - - assert num_categorical_features is not None, ( - "num_categorical_features should be inferred at this point." - ) - - # "Transform" embedding dims into a suitable list, or stay None - if categorical_embedding_dims is not None: - if isinstance(categorical_embedding_dims, int): - categorical_embedding_dims = [categorical_embedding_dims] * num_categorical_features - elif not isinstance(categorical_embedding_dims, list): - raise TypeError("categorical_embedding_dims must be an int or a list of int") - - assert isinstance(categorical_embedding_dims, list) or categorical_embedding_dims is None, ( - "categorical_embedding_dims must be a list of int at this point" - ) - - return categorical_vocabulary_sizes, categorical_embedding_dims, num_categorical_features - - -class NumpyJSONEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, np.integer): - return int(obj) - if isinstance(obj, np.floating): - return float(obj) - if isinstance(obj, np.ndarray): - return obj.tolist() - return super().default(obj) diff --git a/torchTextClassifiers/utilities/plot_explainability.py b/torchTextClassifiers/utilities/plot_explainability.py new file mode 100644 index 0000000..a5ad7f8 --- /dev/null +++ b/torchTextClassifiers/utilities/plot_explainability.py @@ -0,0 +1,184 @@ +from typing import List, Optional + +import numpy as np +import torch + +try: + from matplotlib import pyplot as plt + + HAS_PYPLOT = True +except ImportError: + HAS_PYPLOT = False + + +def map_attributions_to_char(attributions, offsets, text): + """ + Maps token-level attributions to character-level attributions based on token offsets. + Args: + attributions (np.ndarray): Array of shape (top_k, seq_len) or (seq_len,) containing token-level attributions. + Output from: + >>> ttc.predict(X, top_k=top_k, explain=True)["attributions"] + offsets (list of tuples): List of (start, end) offsets for each token in the original text. 
+            Output from:
+                >>> ttc.predict(X, top_k=top_k, explain=True)["offset_mapping"]
+            Also from:
+                >>> ttc.tokenizer.tokenize(text, return_offsets_mapping=True)["offset_mapping"]
+        text (str): The original input text.
+
+    Returns:
+        np.ndarray: Array of shape (top_k, text_len) containing character-level attributions.
+        text_len is the number of characters in the original text.
+
+    """
+
+    if isinstance(text, list):
+        raise ValueError("text must be a single string, not a list of strings.")
+
+    assert isinstance(text, str), "text must be a string."
+
+    if isinstance(attributions, torch.Tensor):
+        attributions = attributions.cpu().numpy()
+
+    if attributions.ndim == 1:
+        attributions = attributions[None, :]
+
+    attributions_per_char = np.zeros((attributions.shape[0], len(text)))  # top_k, text_len
+
+    for token_idx, (start, end) in enumerate(offsets):
+        if start == end:  # skip special tokens
+            continue
+        attributions_per_char[:, start:end] = attributions[:, token_idx][:, None]
+
+    return np.exp(attributions_per_char) / np.sum(
+        np.exp(attributions_per_char), axis=1, keepdims=True
+    )  # softmax normalization
+
+
+def map_attributions_to_word(attributions, word_ids):
+    """
+    Maps token-level attributions to word-level attributions based on word IDs.
+    Args:
+        attributions (np.ndarray): Array of shape (top_k, seq_len) or (seq_len,) containing token-level attributions.
+            Output from:
+                >>> ttc.predict(X, top_k=top_k, explain=True)["attributions"]
+        word_ids (list of int or None): List of word IDs for each token in the original text.
+            Output from:
+                >>> ttc.predict(X, top_k=top_k, explain=True)["word_ids"]
+
+    Returns:
+        np.ndarray: Array of shape (top_k, num_words) containing word-level attributions.
+        num_words is the number of unique words in the original text.
+    """
+
+    word_ids = np.array(word_ids)
+
+    # Work on a numpy copy so the indexing below is consistent for both inputs
+    if isinstance(attributions, torch.Tensor):
+        attributions = attributions.cpu().numpy()
+
+    if attributions.ndim == 1:
+        attributions = attributions[None, :]
+
+    # Convert None to -1 for easier processing (PAD tokens)
+    word_ids_int = np.array([x if x is not None else -1 for x in word_ids], dtype=int)
+
+    # Filter out PAD tokens from attributions and word_ids
+    attributions = attributions[
+        np.arange(attributions.shape[0])[:, None],
+        np.where(word_ids_int != -1)[0][None, :],
+    ]
+    word_ids_int = word_ids_int[word_ids_int != -1]
+    unique_word_ids = np.unique(word_ids_int)
+    num_unique_words = len(unique_word_ids)
+
+    top_k = attributions.shape[0]
+    attr_with_word_id = np.concatenate(
+        (attributions[:, :, None], np.tile(word_ids_int[None, :], reps=(top_k, 1))[:, :, None]),
+        axis=-1,
+    )  # top_k, seq_len, 2
+    # last dim is 2: 0 is the attribution of the token, 1 is the word_id the token is associated to
+
+    word_attributions = np.zeros((top_k, num_unique_words))
+    for word_id in unique_word_ids:
+        mask = attr_with_word_id[:, :, 1] == word_id  # top_k, seq_len
+        word_attributions[:, word_id] = (attr_with_word_id[:, :, 0] * mask).sum(
+            axis=1
+        )  # zero-out non-matching tokens and sum attributions for all tokens belonging to the same word
+
+    # assert word_attributions.sum(axis=1) == attributions.sum(axis=1), "Sum of word attributions per top_k must equal sum of token attributions per top_k."
+    return np.exp(word_attributions) / np.sum(
+        np.exp(word_attributions), axis=1, keepdims=True
+    )  # softmax normalization
+
+
+def plot_attributions_at_char(
+    text: str,
+    attributions_per_char: np.ndarray,
+    figsize=(10, 2),
+    titles: Optional[List[str]] = None,
+):
+    """
+    Plots character-level attributions as one bar plot per prediction.
+    Args:
+        text (str): The original input text.
+        attributions_per_char (np.ndarray): Array of shape (top_k, text_len) containing character-level attributions.
+            Output from map_attributions_to_char function.
+        titles (list of str, optional): Titles for the plots, one per prediction.
+        figsize (tuple): Figure size for each plot.
+    """
+
+    if not HAS_PYPLOT:
+        raise ImportError(
+            "matplotlib is required for plotting. Please install it to use this function."
+        )
+    top_k = attributions_per_char.shape[0]
+
+    all_plots = []
+    for i in range(top_k):
+        fig, ax = plt.subplots(figsize=figsize)
+        ax.bar(range(len(text)), attributions_per_char[i])
+        ax.set_xticks(np.arange(len(text)))
+        ax.set_xticklabels(list(text), rotation=90)
+        title = titles[i] if titles is not None else f"Attributions for Top {i+1} Prediction"
+        ax.set_title(title)
+        ax.set_xlabel("Characters in Text")
+        ax.set_ylabel("Attributions")
+        all_plots.append(fig)
+
+    return all_plots
+
+
+def plot_attributions_at_word(
+    text, attributions_per_word, figsize=(10, 2), titles: Optional[List[str]] = None
+):
+    """
+    Plots word-level attributions as one bar plot per prediction.
+    Args:
+        text (str): The original input text.
+        attributions_per_word (np.ndarray): Array of shape (top_k, num_words) containing word-level attributions.
+            Output from map_attributions_to_word function.
+        titles (list of str, optional): Titles for the plots, one per prediction.
+        figsize (tuple): Figure size for each plot.
+    """
+
+    if not HAS_PYPLOT:
+        raise ImportError(
+            "matplotlib is required for plotting. Please install it to use this function."
+        )
+
+    words = text.split()
+    top_k = attributions_per_word.shape[0]
+    all_plots = []
+    for i in range(top_k):
+        fig, ax = plt.subplots(figsize=figsize)
+        ax.bar(range(len(words)), attributions_per_word[i])
+        ax.set_xticks(np.arange(len(words)))
+        ax.set_xticklabels(words, rotation=90)
+        title = titles[i] if titles is not None else f"Attributions for Top {i+1} Prediction"
+        ax.set_title(title)
+        ax.set_xlabel("Words in Text")
+        ax.set_ylabel("Attributions")
+        all_plots.append(fig)
+
+    return all_plots
+
+
+def figshow(figure):
+    # https://stackoverflow.com/questions/53088212/create-multiple-figures-in-pyplot-but-only-show-one
+    for i in plt.get_fignums():
+        if figure != plt.figure(i):
+            plt.close(plt.figure(i))
+    plt.show()
diff --git a/torchTextClassifiers/utilities/preprocess.py b/torchTextClassifiers/utilities/preprocess.py
deleted file mode 100644
index 900c427..0000000
--- a/torchTextClassifiers/utilities/preprocess.py
+++ /dev/null
@@ -1,82 +0,0 @@
-"""
-Processing fns.
-"""
-
-import string
-
-import numpy as np
-
-try:
-    import nltk
-    from nltk.corpus import stopwords as ntlk_stopwords
-    from nltk.stem.snowball import SnowballStemmer
-
-    HAS_NLTK = True
-except ImportError:
-    HAS_NLTK = False
-
-try:
-    import unidecode
-
-    HAS_UNIDECODE = True
-except ImportError:
-    HAS_UNIDECODE = False
-
-
-def clean_text_feature(text: list[str], remove_stop_words=True):
-    """
-    Cleans a text feature.
-
-    Args:
-        text (list[str]): List of text descriptions.
-        remove_stop_words (bool): If True, remove stopwords.
-
-    Returns:
-        list[str]: List of cleaned text descriptions.
-
-    """
-    if not HAS_NLTK:
-        raise ImportError(
-            "nltk is not installed and is required for preprocessing. Run 'pip install torchFastText[preprocess]'."
-        )
-    if not HAS_UNIDECODE:
-        raise ImportError(
-            "unidecode is not installed and is required for preprocessing. Run 'pip install torchFastText[preprocess]'."
- ) - - # Define stopwords and stemmer - - nltk.download("stopwords", quiet=True) - stopwords = tuple(ntlk_stopwords.words("french")) + tuple(string.ascii_lowercase) - stemmer = SnowballStemmer(language="french") - - # Remove of accented characters - text = np.vectorize(unidecode.unidecode)(np.array(text)) - - # To lowercase - text = np.char.lower(text) - - # Remove one letter words - def mylambda(x): - return " ".join([w for w in x.split() if len(w) > 1]) - - text = np.vectorize(mylambda)(text) - - # Remove duplicate words and stopwords in texts - # Stem words - libs_token = [lib.split() for lib in text.tolist()] - libs_token = [ - sorted(set(libs_token[i]), key=libs_token[i].index) for i in range(len(libs_token)) - ] - if remove_stop_words: - text = [ - " ".join([stemmer.stem(word) for word in libs_token[i] if word not in stopwords]) - for i in range(len(libs_token)) - ] - else: - text = [ - " ".join([stemmer.stem(word) for word in libs_token[i]]) for i in range(len(libs_token)) - ] - - # Return clean DataFrame - return text diff --git a/torchTextClassifiers/utilities/utils.py b/torchTextClassifiers/utilities/utils.py deleted file mode 100644 index c8e216e..0000000 --- a/torchTextClassifiers/utilities/utils.py +++ /dev/null @@ -1,346 +0,0 @@ -""" -Utility functions. -""" - -import warnings -import difflib -from difflib import SequenceMatcher - -import torch -import torch.nn.functional as F - -from .preprocess import clean_text_feature - - -def preprocess_token(token): - preprocessed_token = token.replace("", "") - preprocessed_token = preprocessed_token.replace("<", "") - preprocessed_token = preprocessed_token.replace(">", "") - - preprocessed_token = preprocessed_token.split() - - return preprocessed_token - - -def map_processed_to_original(processed_words, original_words, n=1, cutoff=0.9): - """ - Map processed words to original words based on similarity scores. - - Args: - processed_words (List[str]): List of processed words. - original_words (List[str]): List of original words. - n (int): Number of closest processed words to consider for a given original word. - cutoff (float): Minimum similarity score for a match. - - Returns: - Dict[str, str]: Mapping from original word to the corresponding closest processed word. 
- """ - - # For each word in the original list, find the n closest matching processed words - word_mapping = {} - - for original_word in original_words: - original_word_prepro = clean_text_feature([original_word], remove_stop_words=False)[ - 0 - ] # Preprocess the original word - - if original_word_prepro == "": - continue - - max_similarity_score = 0 - best_processed_word = None - # Calculate the similarity score for each processed word with the current original word - for processed_word in processed_words: - similarity_score = difflib.SequenceMatcher( - None, processed_word, original_word_prepro - ).ratio() # Ratcliff-Obershelp algorithm - - # Only consider matches with similarity above the cutoff - if similarity_score > max_similarity_score and similarity_score >= cutoff: - max_similarity_score = similarity_score - best_processed_word = processed_word - - if best_processed_word is not None: - # original_word = original_word.replace(',', '') - # Add the tuple (list of closest words, list of similarity scores) to the mapping - word_mapping[original_word] = best_processed_word - - return word_mapping - - -def test_end_of_word(all_processed_words, word, target_token, next_token, min_n): - flag = False - if target_token[-1] == ">": - if next_token[0] == "<": - if word in target_token: - flag = True - if word in next_token: - flag = False - if next_token[1] != word[0]: - flag = True - if len(next_token) == min_n: - flag = True - if next_token in all_processed_words: - flag = True - - return flag - - -def match_word_to_token_indexes(sentence, tokenized_sentence_tokens, min_n): - """ - Match words to token indexes in a sentence. - - Args: - sentence (str): Preprocessed sentence. - tokenized_sentence_tokens (List[str]): List of tokenized sentence tokens. - - Returns: - Dict[str, List[int]]: Mapping from word to list of token indexes. 
- - """ - - pointer_token = 0 - res = {} - processed_sentence = clean_text_feature([sentence], remove_stop_words=False)[0] - processed_words = processed_sentence.split() - # we know the tokens are in the right order - for index_word, word in enumerate(processed_words): - if word not in res: - res[word] = [] - - start = pointer_token - - # while we don't reach the end of the word, get going - while not test_end_of_word( - processed_words, - word, - tokenized_sentence_tokens[pointer_token], - tokenized_sentence_tokens[pointer_token + 1], - min_n=min_n, - ): - pointer_token += 1 - if pointer_token == len(tokenized_sentence_tokens) - 1: - warnings.warn("Error in the tokenization of the sentence") - # workaround to avoid error: each word is asociated to regular ranges - chunck = len(tokenized_sentence_tokens) // len(processed_words) - for idx, word in enumerate(processed_words): - res[word] = range( - idx * chunck, min((idx + 1) * chunck, len(tokenized_sentence_tokens)) - ) - return res - - pointer_token += 1 - end = pointer_token - - res[word] += list(range(start, end)) - - # here we arrive at the end of the sentence - assert tokenized_sentence_tokens[pointer_token] == "" - end_of_string_position = pointer_token - - # starting word n_gram - pointer_token += 1 - while pointer_token < len(tokenized_sentence_tokens): - token = tokenized_sentence_tokens[pointer_token] - for index_word, word in enumerate(processed_sentence.split()): - # now, the condition of matching changes: we need to find the word in the token - if word in token: - res[word].append(pointer_token) - pointer_token += 1 - - assert pointer_token == len(tokenized_sentence_tokens) - assert set(sum([v for v in res.values()], [end_of_string_position])) == set( - range(len(tokenized_sentence_tokens)) - ), print( - set(range(len(tokenized_sentence_tokens))) - - set(sum([v for v in res.values()], [end_of_string_position])) - ) # verify if all tokens are used - - return res - - -# at text level -def compute_preprocessed_word_score( - preprocessed_text, - tokenized_text_tokens, - scores, - id_to_token_dicts, - token_to_id_dicts, - min_n, - padding_index=2009603, - end_of_string_index=0, -): - """ - Compute preprocessed word scores based on token scores. - - Args: - preprocessed_text (List[str]): List of preprocessed sentences. - tokenized_text (List[List[int]]): For each sentence, list of token IDs. - scores (List[torch.Tensor]): For each sentence, list of token scores. - id_to_token_dicts (List[Dict[int, str]]): For each sentence, mapping from token ID to token in string form. - token_to_id_dicts (List[Dict[str, int]]): For each sentence, mapping from token (string) to token ID. - padding_index (int): Index of padding token. - end_of_string_index (int): Index of end of string token. - aggregate (bool): Whether to aggregate scores at word level (if False, stay at token level). - - Returns: - List[Dict[str, float]]: For each sentence, mapping from preprocessed word to score. 
- """ - - word_to_score_dicts = [] - word_to_token_idx_dicts = [] - - for idx, sentence in enumerate(preprocessed_text): - tokenized_sentence_tokens = tokenized_text_tokens[idx] # sentence level, List[str] - word_to_token_idx = match_word_to_token_indexes(sentence, tokenized_sentence_tokens, min_n) - score_sentence_topk = scores[idx] # torch.Tensor, token scores, (top_k, seq_len) - - # Calculate the score for each token and map to words - word_to_score_topk = [] - for k in range(len(score_sentence_topk)): - # Initialize word-to-score dictionary with zero values - word_to_score = {word: 0 for word in sentence.split()} - - score_sentence = score_sentence_topk[k] - for word, associated_token_idx in word_to_token_idx.items(): - associated_token_idx = torch.tensor(associated_token_idx).int() - word_to_score[word] = torch.sum(score_sentence[associated_token_idx]).item() - - word_to_score_topk.append(word_to_score.copy()) - - word_to_score_dicts.append(word_to_score_topk) - word_to_token_idx_dicts.append(word_to_token_idx) - - return word_to_score_dicts, word_to_token_idx_dicts - - -def compute_word_score(word_to_score_dicts, text, n=5, cutoff=0.75): - """ - Compute word scores based on preprocessed word scores. - - Args: - word_to_score_dicts (List[List[Dict[str, float]]]): For each sentence, list of top_k mappings from preprocessed word to score. - text (List[str]): List of sentences. - n (int): Number of closest preprocessed words to consider for a given original word. - cutoff (float): Minimum similarity score for a match. - - Returns: - List[List[List[float]]]: For each sentence, list of top-k scores for each word. - """ - - all_scores_text = [] - mappings = [] - for idx, word_to_score_topk in enumerate(word_to_score_dicts): # iteration over sentences - all_scores_topk = [] - processed_words = list(word_to_score_topk[0].keys()) - original_words = text[idx].split() - original_words = list(filter(lambda x: x != ",", original_words)) - mapping = map_processed_to_original( - processed_words, original_words, n=n, cutoff=cutoff - ) # Dict[str, Tuple[List[str], List[float]]] - mappings.append(mapping) - for word_to_score in word_to_score_topk: # iteration over top_k (the preds) - scores = [] - stopwords_idx = [] - for pos_word, word in enumerate(original_words): - if word not in mapping: - scores.append(0) - stopwords_idx.append(pos_word) - continue - matching_processed_word = mapping[word] - word_score = word_to_score[matching_processed_word] - scores.append(word_score) - - scores = torch.tensor(scores) - scores = F.softmax( - scores, dim=-1 - ) # softmax normalization. Length = len(original_words) - scores[stopwords_idx] = 0 - - all_scores_topk.append(scores) # length top_k - - all_scores_text.append(all_scores_topk) # length = len(text) - - return all_scores_text, mappings - - -def explain_continuous( - text, processed_text, tokenized_text_tokens, mappings, word_to_token_idx_dicts, all_attr, top_k -): - """ - Score explanation at letter level. - - Args: - text (List[str]): List of original sentences. - processed_text (List[str]): List of preprocessed sentences. - tokenized_text_tokens (List[List[str]]): List of tokenized sentences. - mappings (List[Dict[str, str]]): List of mappings from original word to preprocessed word. - word_to_token_idx_dicts (List[Dict[str, List[int]]]): List of mappings from preprocessed word to token indexes. - all_attr (torch.Tensor): Tensor of token scores. - top_k (int): Number of top tokens to consider. 
- - Returns: - List[torch.Tensor]: List of letter scores for each sentence. - - - """ - all_scores_text = [] - for idx, processed_sentence in enumerate(processed_text): - tokenized_sentence_tokens = tokenized_text_tokens[idx] - mapping = mappings[idx] - word_to_token_idx = word_to_token_idx_dicts[idx] - original_words = text[idx].split() - original_words = list(filter(lambda x: x != ",", original_words)) - - original_to_token = {} - original_to_token_idxs = {} - - for original in original_words: - # original = original.replace(',', '') - if original not in mapping: - continue - - matching_processed_word = mapping[original] - associated_token_idx = word_to_token_idx[matching_processed_word] - original_to_token[original] = [ - tokenized_sentence_tokens[token_idx] for token_idx in associated_token_idx - ] - original_to_token_idxs[original] = associated_token_idx - - scores_for_k = [] - for k in range(top_k): - scores_for_words = [] - for xxx, original_word in enumerate(original_words): - original_word_prepro = clean_text_feature([original_word], remove_stop_words=False)[ - 0 - ] - - letters = list(original_word) - scores_letter = torch.zeros(len(letters), dtype=torch.float32) - - if original_word not in original_to_token: # if stopword, 0 - scores_for_words.append(scores_letter) - continue - - for pos, token in enumerate(original_to_token[original_word]): - pos_token = original_to_token_idxs[original_word][pos] - # tok = preprocess_token(token)[0] - tok = preprocess_token(token) - score_token = all_attr[idx, k, pos_token].item() - - # Embed the token at the right indexes of the word - sm = SequenceMatcher(None, original_word_prepro, tok) - a, _, size = sm.find_longest_match() - scores_letter[a : a + size] += score_token - - scores_for_words.append(scores_letter) - - all_scores_letter = torch.cat(scores_for_words) - scores = F.softmax(all_scores_letter, dim=-1) - scores[all_scores_letter == 0] = 0 - scores_for_k.append(scores) - - scores_for_sentence = torch.stack(scores_for_k) - all_scores_text.append(scores_for_sentence) - - return torch.stack(all_scores_text) diff --git a/uv.lock b/uv.lock index ee61acd..627d823 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.11" resolution-markers = [ "python_full_version >= '3.12'", @@ -104,6 +104,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929, upload-time = "2024-07-26T18:15:02.05Z" }, ] +[[package]] +name = "anyio" +version = "4.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "sniffio" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/78/7d432127c41b50bccba979505f272c16cbcadcc33645d5fa3a738110ae75/anyio-4.11.0.tar.gz", hash = "sha256:82a8d0b81e318cc5ce71a5f1f8b5c4e63619620b63141ef8c995fa0db95a57c4", size = 219094, upload-time = "2025-09-23T09:19:12.58Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/b3/9b1a8074496371342ec1e796a96f99c82c945a339cd81a8e73de28b4cf9e/anyio-4.11.0-py3-none-any.whl", hash = "sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc", size = 109097, upload-time = "2025-09-23T09:19:10.601Z" }, +] + +[[package]] +name = "asttokens" +version = "3.0.0" +source = { registry 
= "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4a/e7/82da0a03e7ba5141f05cce0d302e6eed121ae055e0456ca228bf693984bc/asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7", size = 61978, upload-time = "2024-11-30T04:30:14.439Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918, upload-time = "2024-11-30T04:30:10.946Z" }, +] + [[package]] name = "attrs" version = "25.3.0" @@ -146,6 +169,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload-time = "2025-06-15T02:45:49.977Z" }, ] +[[package]] +name = "cfgv" +version = "3.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560", size = 7114, upload-time = "2023-08-12T20:38:17.776Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249, upload-time = "2023-08-12T20:38:16.269Z" }, +] + [[package]] name = "charset-normalizer" version = "3.4.2" @@ -215,6 +247,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "comm" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4c/13/7d740c5849255756bc17888787313b61fd38a0a8304fc4f073dfc46122aa/comm-0.2.3.tar.gz", hash = "sha256:2dc8048c10962d55d7ad693be1e7045d891b7ce8d999c97963a5e3e99c055971", size = 6319, upload-time = "2025-07-25T14:02:04.452Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", size = 7294, upload-time = "2025-07-25T14:02:02.896Z" }, +] + [[package]] name = "contourpy" version = "1.3.2" @@ -278,6 +319,58 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, ] +[[package]] +name = "datasets" +version = "4.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, + { name = "filelock" }, + { name = "fsspec", extra = ["http"] }, + { name = "httpx" }, + { name = "huggingface-hub" }, + { name = "multiprocess" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pyyaml" }, + { name = 
"requests" }, + { name = "tqdm" }, + { name = "xxhash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/47/325206ac160f7699ed9f1798afa8f8f8d5189b03bf3815654859ac1d5cba/datasets-4.3.0.tar.gz", hash = "sha256:bc9118ed9afd92346c5be7ed3aaa00177eb907c25467f9d072a0d22777efbd2b", size = 582801, upload-time = "2025-10-23T16:31:51.547Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/51/409a8184ed35453d9cbb3d6b20d524b1115c2c2d117b85d5e9b06cd70b45/datasets-4.3.0-py3-none-any.whl", hash = "sha256:0ea157e72138b3ca6c7d2415f19a164ecf7d4c4fa72da2a570da286882e96903", size = 506846, upload-time = "2025-10-23T16:31:49.965Z" }, +] + +[[package]] +name = "decorator" +version = "5.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, +] + +[[package]] +name = "dill" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/12/80/630b4b88364e9a8c8c5797f4602d0f76ef820909ee32f0bacb9f90654042/dill-0.4.0.tar.gz", hash = "sha256:0633f1d2df477324f53a895b02c901fb961bdbf65a17122586ea7019292cbcf0", size = 186976, upload-time = "2025-04-16T00:41:48.867Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668, upload-time = "2025-04-16T00:41:47.671Z" }, +] + +[[package]] +name = "distlib" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/96/8e/709914eb2b5749865801041647dc7f4e6d00b549cfe88b65ca192995f07c/distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d", size = 614605, upload-time = "2025-07-17T16:52:00.465Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, +] + [[package]] name = "docutils" version = "0.21.2" @@ -287,6 +380,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408, upload-time = "2024-04-23T18:57:14.835Z" }, ] +[[package]] +name = "executing" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cc/28/c14e053b6762b1044f34a13aab6859bbf40456d37d23aa286ac24cfd9a5d/executing-2.2.1.tar.gz", hash = "sha256:3632cc370565f6648cc328b32435bd120a1e4ebb20c77e3fdde9a13cd1e533c4", size = 1129488, upload-time = "2025-09-01T09:48:10.866Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, +] + [[package]] name = "filelock" version = "3.18.0" @@ -420,6 +522,86 @@ http = [ { name = "aiohttp" }, ] +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + +[[package]] +name = "hf-xet" +version = "1.1.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/74/31/feeddfce1748c4a233ec1aa5b7396161c07ae1aa9b7bdbc9a72c3c7dd768/hf_xet-1.1.10.tar.gz", hash = "sha256:408aef343800a2102374a883f283ff29068055c111f003ff840733d3b715bb97", size = 487910, upload-time = "2025-09-12T20:10:27.12Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/a2/343e6d05de96908366bdc0081f2d8607d61200be2ac802769c4284cc65bd/hf_xet-1.1.10-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:686083aca1a6669bc85c21c0563551cbcdaa5cf7876a91f3d074a030b577231d", size = 2761466, upload-time = "2025-09-12T20:10:22.836Z" }, + { url = "https://files.pythonhosted.org/packages/31/f9/6215f948ac8f17566ee27af6430ea72045e0418ce757260248b483f4183b/hf_xet-1.1.10-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:71081925383b66b24eedff3013f8e6bbd41215c3338be4b94ba75fd75b21513b", size = 2623807, upload-time = "2025-09-12T20:10:21.118Z" }, + { url = "https://files.pythonhosted.org/packages/15/07/86397573efefff941e100367bbda0b21496ffcdb34db7ab51912994c32a2/hf_xet-1.1.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b6bceb6361c80c1cc42b5a7b4e3efd90e64630bcf11224dcac50ef30a47e435", size = 3186960, upload-time = "2025-09-12T20:10:19.336Z" }, + { url = "https://files.pythonhosted.org/packages/01/a7/0b2e242b918cc30e1f91980f3c4b026ff2eedaf1e2ad96933bca164b2869/hf_xet-1.1.10-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:eae7c1fc8a664e54753ffc235e11427ca61f4b0477d757cc4eb9ae374b69f09c", size = 3087167, upload-time = "2025-09-12T20:10:17.255Z" }, + { url = "https://files.pythonhosted.org/packages/4a/25/3e32ab61cc7145b11eee9d745988e2f0f4fafda81b25980eebf97d8cff15/hf_xet-1.1.10-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0a0005fd08f002180f7a12d4e13b22be277725bc23ed0529f8add5c7a6309c06", size = 3248612, upload-time = "2025-09-12T20:10:24.093Z" }, + { url = "https://files.pythonhosted.org/packages/2c/3d/ab7109e607ed321afaa690f557a9ada6d6d164ec852fd6bf9979665dc3d6/hf_xet-1.1.10-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f900481cf6e362a6c549c61ff77468bd59d6dd082f3170a36acfef2eb6a6793f", size = 3353360, upload-time = "2025-09-12T20:10:25.563Z" }, + { url = "https://files.pythonhosted.org/packages/ee/0e/471f0a21db36e71a2f1752767ad77e92d8cde24e974e03d662931b1305ec/hf_xet-1.1.10-cp37-abi3-win_amd64.whl", hash = 
"sha256:5f54b19cc347c13235ae7ee98b330c26dd65ef1df47e5316ffb1e87713ca7045", size = 2804691, upload-time = "2025-09-12T20:10:28.433Z" }, +] + +[[package]] +name = "httpcore" +version = "1.0.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, +] + +[[package]] +name = "huggingface-hub" +version = "0.35.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/10/7e/a0a97de7c73671863ca6b3f61fa12518caf35db37825e43d63a70956738c/huggingface_hub-0.35.3.tar.gz", hash = "sha256:350932eaa5cc6a4747efae85126ee220e4ef1b54e29d31c3b45c5612ddf0b32a", size = 461798, upload-time = "2025-09-29T14:29:58.625Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/a0/651f93d154cb72323358bf2bbae3e642bdb5d2f1bfc874d096f7cb159fa0/huggingface_hub-0.35.3-py3-none-any.whl", hash = "sha256:0e3a01829c19d86d03793e4577816fe3bdfc1602ac62c7fb220d593d351224ba", size = 564262, upload-time = "2025-09-29T14:29:55.813Z" }, +] + +[[package]] +name = "identify" +version = "2.6.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ff/e7/685de97986c916a6d93b3876139e00eef26ad5bbbd61925d670ae8013449/identify-2.6.15.tar.gz", hash = "sha256:e4f4864b96c6557ef2a1e1c951771838f4edc9df3a72ec7118b338801b11c7bf", size = 99311, upload-time = "2025-10-02T17:43:40.631Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/1c/e5fd8f973d4f375adb21565739498e2e9a1e54c858a97b9a8ccfdc81da9b/identify-2.6.15-py2.py3-none-any.whl", hash = "sha256:1181ef7608e00704db228516541eb83a88a9f94433a8c80bb9b5bd54b1d81757", size = 99183, upload-time = "2025-10-02T17:43:39.137Z" }, +] + [[package]] name = "idna" version = "3.10" @@ -447,6 
+629,68 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, ] +[[package]] +name = "ipython" +version = "9.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "decorator" }, + { name = "ipython-pygments-lexers" }, + { name = "jedi" }, + { name = "matplotlib-inline" }, + { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, + { name = "prompt-toolkit" }, + { name = "pygments" }, + { name = "stack-data" }, + { name = "traitlets" }, + { name = "typing-extensions", marker = "python_full_version < '3.12'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/29/e6/48c74d54039241a456add616464ea28c6ebf782e4110d419411b83dae06f/ipython-9.7.0.tar.gz", hash = "sha256:5f6de88c905a566c6a9d6c400a8fed54a638e1f7543d17aae2551133216b1e4e", size = 4422115, upload-time = "2025-11-05T12:18:54.646Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/aa/62893d6a591d337aa59dcc4c6f6c842f1fe20cd72c8c5c1f980255243252/ipython-9.7.0-py3-none-any.whl", hash = "sha256:bce8ac85eb9521adc94e1845b4c03d88365fd6ac2f4908ec4ed1eb1b0a065f9f", size = 618911, upload-time = "2025-11-05T12:18:52.484Z" }, +] + +[[package]] +name = "ipython-pygments-lexers" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" }, +] + +[[package]] +name = "ipywidgets" +version = "8.1.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "comm" }, + { name = "ipython" }, + { name = "jupyterlab-widgets" }, + { name = "traitlets" }, + { name = "widgetsnbextension" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4c/ae/c5ce1edc1afe042eadb445e95b0671b03cee61895264357956e61c0d2ac0/ipywidgets-8.1.8.tar.gz", hash = "sha256:61f969306b95f85fba6b6986b7fe45d73124d1d9e3023a8068710d47a22ea668", size = 116739, upload-time = "2025-11-01T21:18:12.393Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/56/6d/0d9848617b9f753b87f214f1c682592f7ca42de085f564352f10f0843026/ipywidgets-8.1.8-py3-none-any.whl", hash = "sha256:ecaca67aed704a338f88f67b1181b58f821ab5dc89c1f0f5ef99db43c1c2921e", size = 139808, upload-time = "2025-11-01T21:18:10.956Z" }, +] + +[[package]] +name = "jedi" +version = "0.19.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "parso" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = 
"2024-11-11T01:41:42.873Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -468,6 +712,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7d/4f/1195bbac8e0c2acc5f740661631d8d750dc38d4a32b23ee5df3cde6f4e0d/joblib-1.5.1-py3-none-any.whl", hash = "sha256:4719a31f054c7d766948dcd83e9613686b27114f190f717cec7eaa2084f8a74a", size = 307746, upload-time = "2025-05-23T12:04:35.124Z" }, ] +[[package]] +name = "jupyterlab-widgets" +version = "3.0.16" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/2d/ef58fed122b268c69c0aa099da20bc67657cdfb2e222688d5731bd5b971d/jupyterlab_widgets-3.0.16.tar.gz", hash = "sha256:423da05071d55cf27a9e602216d35a3a65a3e41cdf9c5d3b643b814ce38c19e0", size = 897423, upload-time = "2025-11-01T21:11:29.724Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/b5/36c712098e6191d1b4e349304ef73a8d06aed77e56ceaac8c0a306c7bda1/jupyterlab_widgets-3.0.16-py3-none-any.whl", hash = "sha256:45fa36d9c6422cf2559198e4db481aa243c7a32d9926b500781c830c80f7ecf8", size = 914926, upload-time = "2025-11-01T21:11:28.008Z" }, +] + [[package]] name = "kiwisolver" version = "1.4.8" @@ -651,6 +904,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1b/92/9a45c91089c3cf690b5badd4be81e392ff086ccca8a1d4e3a08463d8a966/matplotlib-3.10.3-cp313-cp313t-win_amd64.whl", hash = "sha256:4f23ffe95c5667ef8a2b56eea9b53db7f43910fa4a2d5472ae0f72b64deab4d5", size = 8139044, upload-time = "2025-05-08T19:10:44.551Z" }, ] +[[package]] +name = "matplotlib-inline" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c7/74/97e72a36efd4ae2bccb3463284300f8953f199b5ffbc04cbbb0ec78f74b1/matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe", size = 8110, upload-time = "2025-10-23T09:00:22.126Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, +] + [[package]] name = "mdit-py-plugins" version = "0.4.2" @@ -758,6 +1023,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/5d/e17845bb0fa76334477d5de38654d27946d5b5d3695443987a094a71b440/multidict-6.4.4-py3-none-any.whl", hash = "sha256:bd4557071b561a8b3b6075c3ce93cf9bfb6182cb241805c3d66ced3b75eff4ac", size = 10481, upload-time = "2025-05-19T14:16:36.024Z" }, ] +[[package]] +name = "multiprocess" +version = "0.70.16" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603, upload-time = "2024-01-28T18:52:34.85Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824, upload-time = "2024-01-28T18:52:26.062Z" }, + { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519, upload-time = "2024-01-28T18:52:28.115Z" }, + { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741, upload-time = "2024-01-28T18:52:29.395Z" }, + { url = "https://files.pythonhosted.org/packages/ea/89/38df130f2c799090c978b366cfdf5b96d08de5b29a4a293df7f7429fa50b/multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435", size = 132628, upload-time = "2024-01-28T18:52:30.853Z" }, + { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351, upload-time = "2024-01-28T18:52:31.981Z" }, +] + [[package]] name = "myst-parser" version = "4.0.1" @@ -799,6 +1080,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4d/66/7d9e26593edda06e8cb531874633f7c2372279c3b0f46235539fe546df8b/nltk-3.9.1-py3-none-any.whl", hash = "sha256:4fa26829c5b00715afe3061398a8989dc643b92ce7dd93fb4585a70930d168a1", size = 1505442, upload-time = "2024-08-18T19:48:21.909Z" }, ] +[[package]] +name = "nodeenv" +version = "1.9.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, +] + [[package]] name = "numpy" version = "2.2.6" @@ -1030,6 +1320,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436, upload-time = "2024-09-20T13:09:48.112Z" }, ] +[[package]] +name = "parso" +version = "0.8.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d4/de/53e0bcf53d13e005bd8c92e7855142494f41171b34c2536b86187474184d/parso-0.8.5.tar.gz", hash = "sha256:034d7354a9a018bdce352f48b2a8a450f05e9d6ee85db84764e9b6bd96dafe5a", size = 401205, upload-time = "2025-08-23T15:15:28.028Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/32/f8e3c85d1d5250232a5d3477a2a28cc291968ff175caeadaf3cc19ce0e4a/parso-0.8.5-py2.py3-none-any.whl", hash = 
"sha256:646204b5ee239c396d040b90f9e272e9a8017c630092bf59980beb62fd033887", size = 106668, upload-time = "2025-08-23T15:15:25.663Z" }, +] + +[[package]] +name = "pexpect" +version = "4.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ptyprocess" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, +] + [[package]] name = "pillow" version = "11.2.1" @@ -1089,6 +1400,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/21/2c/5e05f58658cf49b6667762cca03d6e7d85cededde2caf2ab37b81f80e574/pillow-11.2.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:208653868d5c9ecc2b327f9b9ef34e0e42a4cdd172c2988fd81d62d2bc9bc044", size = 2674751, upload-time = "2025-04-12T17:49:59.628Z" }, ] +[[package]] +name = "platformdirs" +version = "4.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/61/33/9611380c2bdb1225fdef633e2a9610622310fed35ab11dac9620972ee088/platformdirs-4.5.0.tar.gz", hash = "sha256:70ddccdd7c99fc5942e9fc25636a8b34d04c24b335100223152c2803e4063312", size = 21632, upload-time = "2025-10-08T17:44:48.791Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/73/cb/ac7874b3e5d58441674fb70742e6c374b28b0c7cb988d37d991cde47166c/platformdirs-4.5.0-py3-none-any.whl", hash = "sha256:e578a81bb873cbb89a41fcc904c7ef523cc18284b7e3b3ccf06aca1403b7ebd3", size = 18651, upload-time = "2025-10-08T17:44:47.223Z" }, +] + [[package]] name = "pluggy" version = "1.6.0" @@ -1110,6 +1430,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/2f/a4583c70fbd8cd04910e2884bcc2bdd670e884061f7b4d70bc13e632a993/pockets-0.9.1-py2.py3-none-any.whl", hash = "sha256:68597934193c08a08eb2bf6a1d85593f627c22f9b065cc727a4f03f669d96d86", size = 26263, upload-time = "2019-11-02T14:46:17.814Z" }, ] +[[package]] +name = "pre-commit" +version = "4.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cfgv" }, + { name = "identify" }, + { name = "nodeenv" }, + { name = "pyyaml" }, + { name = "virtualenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/29/7cf5bbc236333876e4b41f56e06857a87937ce4bf91e117a6991a2dbb02a/pre_commit-4.3.0.tar.gz", hash = "sha256:499fe450cc9d42e9d58e606262795ecb64dd05438943c62b66f6a8673da30b16", size = 193792, upload-time = "2025-08-09T18:56:14.651Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/a5/987a405322d78a73b66e39e4a90e4ef156fd7141bf71df987e50717c321b/pre_commit-4.3.0-py2.py3-none-any.whl", hash = "sha256:2b0747ad7e6e967169136edffee14c16e148a778a54e4f967921aa1ebf2308d8", size = 220965, upload-time = "2025-08-09T18:56:13.192Z" }, +] + +[[package]] +name = "prompt-toolkit" +version = "3.0.52" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = 
"sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, +] + [[package]] name = "propcache" version = "0.3.1" @@ -1183,48 +1531,72 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b8/d3/c3cb8f1d6ae3b37f83e1de806713a9b3642c5895f0215a62e1a4bd6e5e34/propcache-0.3.1-py3-none-any.whl", hash = "sha256:9a8ecf38de50a7f518c21568c80f985e776397b902f1ce0b01f799aba1608b40", size = 12376, upload-time = "2025-03-26T03:06:10.5Z" }, ] +[[package]] +name = "ptyprocess" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" }, +] + +[[package]] +name = "pure-eval" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, +] + [[package]] name = "pyarrow" -version = "20.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/ee/a7810cb9f3d6e9238e61d312076a9859bf3668fd21c69744de9532383912/pyarrow-20.0.0.tar.gz", hash = "sha256:febc4a913592573c8d5805091a6c2b5064c8bd6e002131f01061797d91c783c1", size = 1125187, upload-time = "2025-04-27T12:34:23.264Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/47/a2/b7930824181ceadd0c63c1042d01fa4ef63eee233934826a7a2a9af6e463/pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:24ca380585444cb2a31324c546a9a56abbe87e26069189e14bdba19c86c049f0", size = 30856035, upload-time = "2025-04-27T12:28:40.78Z" }, - { url = "https://files.pythonhosted.org/packages/9b/18/c765770227d7f5bdfa8a69f64b49194352325c66a5c3bb5e332dfd5867d9/pyarrow-20.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:95b330059ddfdc591a3225f2d272123be26c8fa76e8c9ee1a77aad507361cfdb", size = 32309552, upload-time = "2025-04-27T12:28:47.051Z" }, - { url = "https://files.pythonhosted.org/packages/44/fb/dfb2dfdd3e488bb14f822d7335653092dde150cffc2da97de6e7500681f9/pyarrow-20.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f0fb1041267e9968c6d0d2ce3ff92e3928b243e2b6d11eeb84d9ac547308232", size = 41334704, 
upload-time = "2025-04-27T12:28:55.064Z" }, - { url = "https://files.pythonhosted.org/packages/58/0d/08a95878d38808051a953e887332d4a76bc06c6ee04351918ee1155407eb/pyarrow-20.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8ff87cc837601532cc8242d2f7e09b4e02404de1b797aee747dd4ba4bd6313f", size = 42399836, upload-time = "2025-04-27T12:29:02.13Z" }, - { url = "https://files.pythonhosted.org/packages/f3/cd/efa271234dfe38f0271561086eedcad7bc0f2ddd1efba423916ff0883684/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:7a3a5dcf54286e6141d5114522cf31dd67a9e7c9133d150799f30ee302a7a1ab", size = 40711789, upload-time = "2025-04-27T12:29:09.951Z" }, - { url = "https://files.pythonhosted.org/packages/46/1f/7f02009bc7fc8955c391defee5348f510e589a020e4b40ca05edcb847854/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a6ad3e7758ecf559900261a4df985662df54fb7fdb55e8e3b3aa99b23d526b62", size = 42301124, upload-time = "2025-04-27T12:29:17.187Z" }, - { url = "https://files.pythonhosted.org/packages/4f/92/692c562be4504c262089e86757a9048739fe1acb4024f92d39615e7bab3f/pyarrow-20.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6bb830757103a6cb300a04610e08d9636f0cd223d32f388418ea893a3e655f1c", size = 42916060, upload-time = "2025-04-27T12:29:24.253Z" }, - { url = "https://files.pythonhosted.org/packages/a4/ec/9f5c7e7c828d8e0a3c7ef50ee62eca38a7de2fa6eb1b8fa43685c9414fef/pyarrow-20.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:96e37f0766ecb4514a899d9a3554fadda770fb57ddf42b63d80f14bc20aa7db3", size = 44547640, upload-time = "2025-04-27T12:29:32.782Z" }, - { url = "https://files.pythonhosted.org/packages/54/96/46613131b4727f10fd2ffa6d0d6f02efcc09a0e7374eff3b5771548aa95b/pyarrow-20.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:3346babb516f4b6fd790da99b98bed9708e3f02e734c84971faccb20736848dc", size = 25781491, upload-time = "2025-04-27T12:29:38.464Z" }, - { url = "https://files.pythonhosted.org/packages/a1/d6/0c10e0d54f6c13eb464ee9b67a68b8c71bcf2f67760ef5b6fbcddd2ab05f/pyarrow-20.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:75a51a5b0eef32727a247707d4755322cb970be7e935172b6a3a9f9ae98404ba", size = 30815067, upload-time = "2025-04-27T12:29:44.384Z" }, - { url = "https://files.pythonhosted.org/packages/7e/e2/04e9874abe4094a06fd8b0cbb0f1312d8dd7d707f144c2ec1e5e8f452ffa/pyarrow-20.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:211d5e84cecc640c7a3ab900f930aaff5cd2702177e0d562d426fb7c4f737781", size = 32297128, upload-time = "2025-04-27T12:29:52.038Z" }, - { url = "https://files.pythonhosted.org/packages/31/fd/c565e5dcc906a3b471a83273039cb75cb79aad4a2d4a12f76cc5ae90a4b8/pyarrow-20.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ba3cf4182828be7a896cbd232aa8dd6a31bd1f9e32776cc3796c012855e1199", size = 41334890, upload-time = "2025-04-27T12:29:59.452Z" }, - { url = "https://files.pythonhosted.org/packages/af/a9/3bdd799e2c9b20c1ea6dc6fa8e83f29480a97711cf806e823f808c2316ac/pyarrow-20.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c3a01f313ffe27ac4126f4c2e5ea0f36a5fc6ab51f8726cf41fee4b256680bd", size = 42421775, upload-time = "2025-04-27T12:30:06.875Z" }, - { url = "https://files.pythonhosted.org/packages/10/f7/da98ccd86354c332f593218101ae56568d5dcedb460e342000bd89c49cc1/pyarrow-20.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a2791f69ad72addd33510fec7bb14ee06c2a448e06b649e264c094c5b5f7ce28", size = 40687231, upload-time = 
"2025-04-27T12:30:13.954Z" }, - { url = "https://files.pythonhosted.org/packages/bb/1b/2168d6050e52ff1e6cefc61d600723870bf569cbf41d13db939c8cf97a16/pyarrow-20.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:4250e28a22302ce8692d3a0e8ec9d9dde54ec00d237cff4dfa9c1fbf79e472a8", size = 42295639, upload-time = "2025-04-27T12:30:21.949Z" }, - { url = "https://files.pythonhosted.org/packages/b2/66/2d976c0c7158fd25591c8ca55aee026e6d5745a021915a1835578707feb3/pyarrow-20.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:89e030dc58fc760e4010148e6ff164d2f44441490280ef1e97a542375e41058e", size = 42908549, upload-time = "2025-04-27T12:30:29.551Z" }, - { url = "https://files.pythonhosted.org/packages/31/a9/dfb999c2fc6911201dcbf348247f9cc382a8990f9ab45c12eabfd7243a38/pyarrow-20.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6102b4864d77102dbbb72965618e204e550135a940c2534711d5ffa787df2a5a", size = 44557216, upload-time = "2025-04-27T12:30:36.977Z" }, - { url = "https://files.pythonhosted.org/packages/a0/8e/9adee63dfa3911be2382fb4d92e4b2e7d82610f9d9f668493bebaa2af50f/pyarrow-20.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:96d6a0a37d9c98be08f5ed6a10831d88d52cac7b13f5287f1e0f625a0de8062b", size = 25660496, upload-time = "2025-04-27T12:30:42.809Z" }, - { url = "https://files.pythonhosted.org/packages/9b/aa/daa413b81446d20d4dad2944110dcf4cf4f4179ef7f685dd5a6d7570dc8e/pyarrow-20.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:a15532e77b94c61efadde86d10957950392999503b3616b2ffcef7621a002893", size = 30798501, upload-time = "2025-04-27T12:30:48.351Z" }, - { url = "https://files.pythonhosted.org/packages/ff/75/2303d1caa410925de902d32ac215dc80a7ce7dd8dfe95358c165f2adf107/pyarrow-20.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:dd43f58037443af715f34f1322c782ec463a3c8a94a85fdb2d987ceb5658e061", size = 32277895, upload-time = "2025-04-27T12:30:55.238Z" }, - { url = "https://files.pythonhosted.org/packages/92/41/fe18c7c0b38b20811b73d1bdd54b1fccba0dab0e51d2048878042d84afa8/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa0d288143a8585806e3cc7c39566407aab646fb9ece164609dac1cfff45f6ae", size = 41327322, upload-time = "2025-04-27T12:31:05.587Z" }, - { url = "https://files.pythonhosted.org/packages/da/ab/7dbf3d11db67c72dbf36ae63dcbc9f30b866c153b3a22ef728523943eee6/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6953f0114f8d6f3d905d98e987d0924dabce59c3cda380bdfaa25a6201563b4", size = 42411441, upload-time = "2025-04-27T12:31:15.675Z" }, - { url = "https://files.pythonhosted.org/packages/90/c3/0c7da7b6dac863af75b64e2f827e4742161128c350bfe7955b426484e226/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:991f85b48a8a5e839b2128590ce07611fae48a904cae6cab1f089c5955b57eb5", size = 40677027, upload-time = "2025-04-27T12:31:24.631Z" }, - { url = "https://files.pythonhosted.org/packages/be/27/43a47fa0ff9053ab5203bb3faeec435d43c0d8bfa40179bfd076cdbd4e1c/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:97c8dc984ed09cb07d618d57d8d4b67a5100a30c3818c2fb0b04599f0da2de7b", size = 42281473, upload-time = "2025-04-27T12:31:31.311Z" }, - { url = "https://files.pythonhosted.org/packages/bc/0b/d56c63b078876da81bbb9ba695a596eabee9b085555ed12bf6eb3b7cab0e/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9b71daf534f4745818f96c214dbc1e6124d7daf059167330b610fc69b6f3d3e3", size = 42893897, upload-time = "2025-04-27T12:31:39.406Z" }, - { url = 
"https://files.pythonhosted.org/packages/92/ac/7d4bd020ba9145f354012838692d48300c1b8fe5634bfda886abcada67ed/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e8b88758f9303fa5a83d6c90e176714b2fd3852e776fc2d7e42a22dd6c2fb368", size = 44543847, upload-time = "2025-04-27T12:31:45.997Z" }, - { url = "https://files.pythonhosted.org/packages/9d/07/290f4abf9ca702c5df7b47739c1b2c83588641ddfa2cc75e34a301d42e55/pyarrow-20.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:30b3051b7975801c1e1d387e17c588d8ab05ced9b1e14eec57915f79869b5031", size = 25653219, upload-time = "2025-04-27T12:31:54.11Z" }, - { url = "https://files.pythonhosted.org/packages/95/df/720bb17704b10bd69dde086e1400b8eefb8f58df3f8ac9cff6c425bf57f1/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:ca151afa4f9b7bc45bcc791eb9a89e90a9eb2772767d0b1e5389609c7d03db63", size = 30853957, upload-time = "2025-04-27T12:31:59.215Z" }, - { url = "https://files.pythonhosted.org/packages/d9/72/0d5f875efc31baef742ba55a00a25213a19ea64d7176e0fe001c5d8b6e9a/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:4680f01ecd86e0dd63e39eb5cd59ef9ff24a9d166db328679e36c108dc993d4c", size = 32247972, upload-time = "2025-04-27T12:32:05.369Z" }, - { url = "https://files.pythonhosted.org/packages/d5/bc/e48b4fa544d2eea72f7844180eb77f83f2030b84c8dad860f199f94307ed/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f4c8534e2ff059765647aa69b75d6543f9fef59e2cd4c6d18015192565d2b70", size = 41256434, upload-time = "2025-04-27T12:32:11.814Z" }, - { url = "https://files.pythonhosted.org/packages/c3/01/974043a29874aa2cf4f87fb07fd108828fc7362300265a2a64a94965e35b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e1f8a47f4b4ae4c69c4d702cfbdfe4d41e18e5c7ef6f1bb1c50918c1e81c57b", size = 42353648, upload-time = "2025-04-27T12:32:20.766Z" }, - { url = "https://files.pythonhosted.org/packages/68/95/cc0d3634cde9ca69b0e51cbe830d8915ea32dda2157560dda27ff3b3337b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:a1f60dc14658efaa927f8214734f6a01a806d7690be4b3232ba526836d216122", size = 40619853, upload-time = "2025-04-27T12:32:28.1Z" }, - { url = "https://files.pythonhosted.org/packages/29/c2/3ad40e07e96a3e74e7ed7cc8285aadfa84eb848a798c98ec0ad009eb6bcc/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:204a846dca751428991346976b914d6d2a82ae5b8316a6ed99789ebf976551e6", size = 42241743, upload-time = "2025-04-27T12:32:35.792Z" }, - { url = "https://files.pythonhosted.org/packages/eb/cb/65fa110b483339add6a9bc7b6373614166b14e20375d4daa73483755f830/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f3b117b922af5e4c6b9a9115825726cac7d8b1421c37c2b5e24fbacc8930612c", size = 42839441, upload-time = "2025-04-27T12:32:46.64Z" }, - { url = "https://files.pythonhosted.org/packages/98/7b/f30b1954589243207d7a0fbc9997401044bf9a033eec78f6cb50da3f304a/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e724a3fd23ae5b9c010e7be857f4405ed5e679db5c93e66204db1a69f733936a", size = 44503279, upload-time = "2025-04-27T12:32:56.503Z" }, - { url = "https://files.pythonhosted.org/packages/37/40/ad395740cd641869a13bcf60851296c89624662575621968dcfafabaa7f6/pyarrow-20.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:82f1ee5133bd8f49d31be1299dc07f585136679666b502540db854968576faf9", size = 25944982, upload-time = "2025-04-27T12:33:04.72Z" }, +version = "22.0.0" +source = { registry = "https://pypi.org/simple" } 
+sdist = { url = "https://files.pythonhosted.org/packages/30/53/04a7fdc63e6056116c9ddc8b43bc28c12cdd181b85cbeadb79278475f3ae/pyarrow-22.0.0.tar.gz", hash = "sha256:3d600dc583260d845c7d8a6db540339dd883081925da2bd1c5cb808f720b3cd9", size = 1151151, upload-time = "2025-10-24T12:30:00.762Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/b7/18f611a8cdc43417f9394a3ccd3eace2f32183c08b9eddc3d17681819f37/pyarrow-22.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:3e294c5eadfb93d78b0763e859a0c16d4051fc1c5231ae8956d61cb0b5666f5a", size = 34272022, upload-time = "2025-10-24T10:04:28.973Z" }, + { url = "https://files.pythonhosted.org/packages/26/5c/f259e2526c67eb4b9e511741b19870a02363a47a35edbebc55c3178db22d/pyarrow-22.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:69763ab2445f632d90b504a815a2a033f74332997052b721002298ed6de40f2e", size = 35995834, upload-time = "2025-10-24T10:04:35.467Z" }, + { url = "https://files.pythonhosted.org/packages/50/8d/281f0f9b9376d4b7f146913b26fac0aa2829cd1ee7e997f53a27411bbb92/pyarrow-22.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:b41f37cabfe2463232684de44bad753d6be08a7a072f6a83447eeaf0e4d2a215", size = 45030348, upload-time = "2025-10-24T10:04:43.366Z" }, + { url = "https://files.pythonhosted.org/packages/f5/e5/53c0a1c428f0976bf22f513d79c73000926cb00b9c138d8e02daf2102e18/pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:35ad0f0378c9359b3f297299c3309778bb03b8612f987399a0333a560b43862d", size = 47699480, upload-time = "2025-10-24T10:04:51.486Z" }, + { url = "https://files.pythonhosted.org/packages/95/e1/9dbe4c465c3365959d183e6345d0a8d1dc5b02ca3f8db4760b3bc834cf25/pyarrow-22.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8382ad21458075c2e66a82a29d650f963ce51c7708c7c0ff313a8c206c4fd5e8", size = 48011148, upload-time = "2025-10-24T10:04:59.585Z" }, + { url = "https://files.pythonhosted.org/packages/c5/b4/7caf5d21930061444c3cf4fa7535c82faf5263e22ce43af7c2759ceb5b8b/pyarrow-22.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1a812a5b727bc09c3d7ea072c4eebf657c2f7066155506ba31ebf4792f88f016", size = 50276964, upload-time = "2025-10-24T10:05:08.175Z" }, + { url = "https://files.pythonhosted.org/packages/ae/f3/cec89bd99fa3abf826f14d4e53d3d11340ce6f6af4d14bdcd54cd83b6576/pyarrow-22.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:ec5d40dd494882704fb876c16fa7261a69791e784ae34e6b5992e977bd2e238c", size = 28106517, upload-time = "2025-10-24T10:05:14.314Z" }, + { url = "https://files.pythonhosted.org/packages/af/63/ba23862d69652f85b615ca14ad14f3bcfc5bf1b99ef3f0cd04ff93fdad5a/pyarrow-22.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:bea79263d55c24a32b0d79c00a1c58bb2ee5f0757ed95656b01c0fb310c5af3d", size = 34211578, upload-time = "2025-10-24T10:05:21.583Z" }, + { url = "https://files.pythonhosted.org/packages/b1/d0/f9ad86fe809efd2bcc8be32032fa72e8b0d112b01ae56a053006376c5930/pyarrow-22.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:12fe549c9b10ac98c91cf791d2945e878875d95508e1a5d14091a7aaa66d9cf8", size = 35989906, upload-time = "2025-10-24T10:05:29.485Z" }, + { url = "https://files.pythonhosted.org/packages/b4/a8/f910afcb14630e64d673f15904ec27dd31f1e009b77033c365c84e8c1e1d/pyarrow-22.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:334f900ff08ce0423407af97e6c26ad5d4e3b0763645559ece6fbf3747d6a8f5", size = 45021677, upload-time = "2025-10-24T10:05:38.274Z" }, + { url = 
"https://files.pythonhosted.org/packages/13/95/aec81f781c75cd10554dc17a25849c720d54feafb6f7847690478dcf5ef8/pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c6c791b09c57ed76a18b03f2631753a4960eefbbca80f846da8baefc6491fcfe", size = 47726315, upload-time = "2025-10-24T10:05:47.314Z" }, + { url = "https://files.pythonhosted.org/packages/bb/d4/74ac9f7a54cfde12ee42734ea25d5a3c9a45db78f9def949307a92720d37/pyarrow-22.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c3200cb41cdbc65156e5f8c908d739b0dfed57e890329413da2748d1a2cd1a4e", size = 47990906, upload-time = "2025-10-24T10:05:58.254Z" }, + { url = "https://files.pythonhosted.org/packages/2e/71/fedf2499bf7a95062eafc989ace56572f3343432570e1c54e6599d5b88da/pyarrow-22.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ac93252226cf288753d8b46280f4edf3433bf9508b6977f8dd8526b521a1bbb9", size = 50306783, upload-time = "2025-10-24T10:06:08.08Z" }, + { url = "https://files.pythonhosted.org/packages/68/ed/b202abd5a5b78f519722f3d29063dda03c114711093c1995a33b8e2e0f4b/pyarrow-22.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:44729980b6c50a5f2bfcc2668d36c569ce17f8b17bccaf470c4313dcbbf13c9d", size = 27972883, upload-time = "2025-10-24T10:06:14.204Z" }, + { url = "https://files.pythonhosted.org/packages/a6/d6/d0fac16a2963002fc22c8fa75180a838737203d558f0ed3b564c4a54eef5/pyarrow-22.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e6e95176209257803a8b3d0394f21604e796dadb643d2f7ca21b66c9c0b30c9a", size = 34204629, upload-time = "2025-10-24T10:06:20.274Z" }, + { url = "https://files.pythonhosted.org/packages/c6/9c/1d6357347fbae062ad3f17082f9ebc29cc733321e892c0d2085f42a2212b/pyarrow-22.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:001ea83a58024818826a9e3f89bf9310a114f7e26dfe404a4c32686f97bd7901", size = 35985783, upload-time = "2025-10-24T10:06:27.301Z" }, + { url = "https://files.pythonhosted.org/packages/ff/c0/782344c2ce58afbea010150df07e3a2f5fdad299cd631697ae7bd3bac6e3/pyarrow-22.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ce20fe000754f477c8a9125543f1936ea5b8867c5406757c224d745ed033e691", size = 45020999, upload-time = "2025-10-24T10:06:35.387Z" }, + { url = "https://files.pythonhosted.org/packages/1b/8b/5362443737a5307a7b67c1017c42cd104213189b4970bf607e05faf9c525/pyarrow-22.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e0a15757fccb38c410947df156f9749ae4a3c89b2393741a50521f39a8cf202a", size = 47724601, upload-time = "2025-10-24T10:06:43.551Z" }, + { url = "https://files.pythonhosted.org/packages/69/4d/76e567a4fc2e190ee6072967cb4672b7d9249ac59ae65af2d7e3047afa3b/pyarrow-22.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cedb9dd9358e4ea1d9bce3665ce0797f6adf97ff142c8e25b46ba9cdd508e9b6", size = 48001050, upload-time = "2025-10-24T10:06:52.284Z" }, + { url = "https://files.pythonhosted.org/packages/01/5e/5653f0535d2a1aef8223cee9d92944cb6bccfee5cf1cd3f462d7cb022790/pyarrow-22.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:252be4a05f9d9185bb8c18e83764ebcfea7185076c07a7a662253af3a8c07941", size = 50307877, upload-time = "2025-10-24T10:07:02.405Z" }, + { url = "https://files.pythonhosted.org/packages/2d/f8/1d0bd75bf9328a3b826e24a16e5517cd7f9fbf8d34a3184a4566ef5a7f29/pyarrow-22.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:a4893d31e5ef780b6edcaf63122df0f8d321088bb0dee4c8c06eccb1ca28d145", size = 27977099, upload-time = "2025-10-24T10:08:07.259Z" }, + { url = 
"https://files.pythonhosted.org/packages/90/81/db56870c997805bf2b0f6eeeb2d68458bf4654652dccdcf1bf7a42d80903/pyarrow-22.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:f7fe3dbe871294ba70d789be16b6e7e52b418311e166e0e3cba9522f0f437fb1", size = 34336685, upload-time = "2025-10-24T10:07:11.47Z" }, + { url = "https://files.pythonhosted.org/packages/1c/98/0727947f199aba8a120f47dfc229eeb05df15bcd7a6f1b669e9f882afc58/pyarrow-22.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:ba95112d15fd4f1105fb2402c4eab9068f0554435e9b7085924bcfaac2cc306f", size = 36032158, upload-time = "2025-10-24T10:07:18.626Z" }, + { url = "https://files.pythonhosted.org/packages/96/b4/9babdef9c01720a0785945c7cf550e4acd0ebcd7bdd2e6f0aa7981fa85e2/pyarrow-22.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:c064e28361c05d72eed8e744c9605cbd6d2bb7481a511c74071fd9b24bc65d7d", size = 44892060, upload-time = "2025-10-24T10:07:26.002Z" }, + { url = "https://files.pythonhosted.org/packages/f8/ca/2f8804edd6279f78a37062d813de3f16f29183874447ef6d1aadbb4efa0f/pyarrow-22.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6f9762274496c244d951c819348afbcf212714902742225f649cf02823a6a10f", size = 47504395, upload-time = "2025-10-24T10:07:34.09Z" }, + { url = "https://files.pythonhosted.org/packages/b9/f0/77aa5198fd3943682b2e4faaf179a674f0edea0d55d326d83cb2277d9363/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a9d9ffdc2ab696f6b15b4d1f7cec6658e1d788124418cb30030afbae31c64746", size = 48066216, upload-time = "2025-10-24T10:07:43.528Z" }, + { url = "https://files.pythonhosted.org/packages/79/87/a1937b6e78b2aff18b706d738c9e46ade5bfcf11b294e39c87706a0089ac/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ec1a15968a9d80da01e1d30349b2b0d7cc91e96588ee324ce1b5228175043e95", size = 50288552, upload-time = "2025-10-24T10:07:53.519Z" }, + { url = "https://files.pythonhosted.org/packages/60/ae/b5a5811e11f25788ccfdaa8f26b6791c9807119dffcf80514505527c384c/pyarrow-22.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:bba208d9c7decf9961998edf5c65e3ea4355d5818dd6cd0f6809bec1afb951cc", size = 28262504, upload-time = "2025-10-24T10:08:00.932Z" }, + { url = "https://files.pythonhosted.org/packages/bd/b0/0fa4d28a8edb42b0a7144edd20befd04173ac79819547216f8a9f36f9e50/pyarrow-22.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:9bddc2cade6561f6820d4cd73f99a0243532ad506bc510a75a5a65a522b2d74d", size = 34224062, upload-time = "2025-10-24T10:08:14.101Z" }, + { url = "https://files.pythonhosted.org/packages/0f/a8/7a719076b3c1be0acef56a07220c586f25cd24de0e3f3102b438d18ae5df/pyarrow-22.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e70ff90c64419709d38c8932ea9fe1cc98415c4f87ea8da81719e43f02534bc9", size = 35990057, upload-time = "2025-10-24T10:08:21.842Z" }, + { url = "https://files.pythonhosted.org/packages/89/3c/359ed54c93b47fb6fe30ed16cdf50e3f0e8b9ccfb11b86218c3619ae50a8/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:92843c305330aa94a36e706c16209cd4df274693e777ca47112617db7d0ef3d7", size = 45068002, upload-time = "2025-10-24T10:08:29.034Z" }, + { url = "https://files.pythonhosted.org/packages/55/fc/4945896cc8638536ee787a3bd6ce7cec8ec9acf452d78ec39ab328efa0a1/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:6dda1ddac033d27421c20d7a7943eec60be44e0db4e079f33cc5af3b8280ccde", size = 47737765, upload-time = "2025-10-24T10:08:38.559Z" }, + { url = 
"https://files.pythonhosted.org/packages/cd/5e/7cb7edeb2abfaa1f79b5d5eb89432356155c8426f75d3753cbcb9592c0fd/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:84378110dd9a6c06323b41b56e129c504d157d1a983ce8f5443761eb5256bafc", size = 48048139, upload-time = "2025-10-24T10:08:46.784Z" }, + { url = "https://files.pythonhosted.org/packages/88/c6/546baa7c48185f5e9d6e59277c4b19f30f48c94d9dd938c2a80d4d6b067c/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:854794239111d2b88b40b6ef92aa478024d1e5074f364033e73e21e3f76b25e0", size = 50314244, upload-time = "2025-10-24T10:08:55.771Z" }, + { url = "https://files.pythonhosted.org/packages/3c/79/755ff2d145aafec8d347bf18f95e4e81c00127f06d080135dfc86aea417c/pyarrow-22.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:b883fe6fd85adad7932b3271c38ac289c65b7337c2c132e9569f9d3940620730", size = 28757501, upload-time = "2025-10-24T10:09:59.891Z" }, + { url = "https://files.pythonhosted.org/packages/0e/d2/237d75ac28ced3147912954e3c1a174df43a95f4f88e467809118a8165e0/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:7a820d8ae11facf32585507c11f04e3f38343c1e784c9b5a8b1da5c930547fe2", size = 34355506, upload-time = "2025-10-24T10:09:02.953Z" }, + { url = "https://files.pythonhosted.org/packages/1e/2c/733dfffe6d3069740f98e57ff81007809067d68626c5faef293434d11bd6/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:c6ec3675d98915bf1ec8b3c7986422682f7232ea76cad276f4c8abd5b7319b70", size = 36047312, upload-time = "2025-10-24T10:09:10.334Z" }, + { url = "https://files.pythonhosted.org/packages/7c/2b/29d6e3782dc1f299727462c1543af357a0f2c1d3c160ce199950d9ca51eb/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:3e739edd001b04f654b166204fc7a9de896cf6007eaff33409ee9e50ceaff754", size = 45081609, upload-time = "2025-10-24T10:09:18.61Z" }, + { url = "https://files.pythonhosted.org/packages/8d/42/aa9355ecc05997915af1b7b947a7f66c02dcaa927f3203b87871c114ba10/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:7388ac685cab5b279a41dfe0a6ccd99e4dbf322edfb63e02fc0443bf24134e91", size = 47703663, upload-time = "2025-10-24T10:09:27.369Z" }, + { url = "https://files.pythonhosted.org/packages/ee/62/45abedde480168e83a1de005b7b7043fd553321c1e8c5a9a114425f64842/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f633074f36dbc33d5c05b5dc75371e5660f1dbf9c8b1d95669def05e5425989c", size = 48066543, upload-time = "2025-10-24T10:09:34.908Z" }, + { url = "https://files.pythonhosted.org/packages/84/e9/7878940a5b072e4f3bf998770acafeae13b267f9893af5f6d4ab3904b67e/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4c19236ae2402a8663a2c8f21f1870a03cc57f0bef7e4b6eb3238cc82944de80", size = 50288838, upload-time = "2025-10-24T10:09:44.394Z" }, + { url = "https://files.pythonhosted.org/packages/7b/03/f335d6c52b4a4761bcc83499789a1e2e16d9d201a58c327a9b5cc9a41bd9/pyarrow-22.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0c34fe18094686194f204a3b1787a27456897d8a2d62caf84b61e8dfbc0252ae", size = 29185594, upload-time = "2025-10-24T10:09:53.111Z" }, ] [[package]] @@ -1412,6 +1784,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/53/97/d2cbbaa10c9b826af0e10fdf836e1bf344d9f0abb873ebc34d1f49642d3f/roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c", size = 7742, upload-time = "2025-02-22T07:34:52.422Z" }, ] +[[package]] +name = "ruff" +version = "0.14.3" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/75/62/50b7727004dfe361104dfbf898c45a9a2fdfad8c72c04ae62900224d6ecf/ruff-0.14.3.tar.gz", hash = "sha256:4ff876d2ab2b161b6de0aa1f5bd714e8e9b4033dc122ee006925fbacc4f62153", size = 5558687, upload-time = "2025-10-31T00:26:26.878Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/8e/0c10ff1ea5d4360ab8bfca4cb2c9d979101a391f3e79d2616c9bf348cd26/ruff-0.14.3-py3-none-linux_armv6l.whl", hash = "sha256:876b21e6c824f519446715c1342b8e60f97f93264012de9d8d10314f8a79c371", size = 12535613, upload-time = "2025-10-31T00:25:44.302Z" }, + { url = "https://files.pythonhosted.org/packages/d3/c8/6724f4634c1daf52409fbf13fefda64aa9c8f81e44727a378b7b73dc590b/ruff-0.14.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b6fd8c79b457bedd2abf2702b9b472147cd860ed7855c73a5247fa55c9117654", size = 12855812, upload-time = "2025-10-31T00:25:47.793Z" }, + { url = "https://files.pythonhosted.org/packages/de/03/db1bce591d55fd5f8a08bb02517fa0b5097b2ccabd4ea1ee29aa72b67d96/ruff-0.14.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:71ff6edca490c308f083156938c0c1a66907151263c4abdcb588602c6e696a14", size = 11944026, upload-time = "2025-10-31T00:25:49.657Z" }, + { url = "https://files.pythonhosted.org/packages/0b/75/4f8dbd48e03272715d12c87dc4fcaaf21b913f0affa5f12a4e9c6f8a0582/ruff-0.14.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:786ee3ce6139772ff9272aaf43296d975c0217ee1b97538a98171bf0d21f87ed", size = 12356818, upload-time = "2025-10-31T00:25:51.949Z" }, + { url = "https://files.pythonhosted.org/packages/ec/9b/506ec5b140c11d44a9a4f284ea7c14ebf6f8b01e6e8917734a3325bff787/ruff-0.14.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cd6291d0061811c52b8e392f946889916757610d45d004e41140d81fb6cd5ddc", size = 12336745, upload-time = "2025-10-31T00:25:54.248Z" }, + { url = "https://files.pythonhosted.org/packages/c7/e1/c560d254048c147f35e7f8131d30bc1f63a008ac61595cf3078a3e93533d/ruff-0.14.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a497ec0c3d2c88561b6d90f9c29f5ae68221ac00d471f306fa21fa4264ce5fcd", size = 13101684, upload-time = "2025-10-31T00:25:56.253Z" }, + { url = "https://files.pythonhosted.org/packages/a5/32/e310133f8af5cd11f8cc30f52522a3ebccc5ea5bff4b492f94faceaca7a8/ruff-0.14.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:e231e1be58fc568950a04fbe6887c8e4b85310e7889727e2b81db205c45059eb", size = 14535000, upload-time = "2025-10-31T00:25:58.397Z" }, + { url = "https://files.pythonhosted.org/packages/a2/a1/7b0470a22158c6d8501eabc5e9b6043c99bede40fa1994cadf6b5c2a61c7/ruff-0.14.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:469e35872a09c0e45fecf48dd960bfbce056b5db2d5e6b50eca329b4f853ae20", size = 14156450, upload-time = "2025-10-31T00:26:00.889Z" }, + { url = "https://files.pythonhosted.org/packages/0a/96/24bfd9d1a7f532b560dcee1a87096332e461354d3882124219bcaff65c09/ruff-0.14.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d6bc90307c469cb9d28b7cfad90aaa600b10d67c6e22026869f585e1e8a2db0", size = 13568414, upload-time = "2025-10-31T00:26:03.291Z" }, + { url = "https://files.pythonhosted.org/packages/a7/e7/138b883f0dfe4ad5b76b58bf4ae675f4d2176ac2b24bdd81b4d966b28c61/ruff-0.14.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2f8a0bbcffcfd895df39c9a4ecd59bb80dca03dc43f7fb63e647ed176b741e", size = 13315293, upload-time = "2025-10-31T00:26:05.708Z" }, 
+ { url = "https://files.pythonhosted.org/packages/33/f4/c09bb898be97b2eb18476b7c950df8815ef14cf956074177e9fbd40b7719/ruff-0.14.3-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:678fdd7c7d2d94851597c23ee6336d25f9930b460b55f8598e011b57c74fd8c5", size = 13539444, upload-time = "2025-10-31T00:26:08.09Z" }, + { url = "https://files.pythonhosted.org/packages/9c/aa/b30a1db25fc6128b1dd6ff0741fa4abf969ded161599d07ca7edd0739cc0/ruff-0.14.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:1ec1ac071e7e37e0221d2f2dbaf90897a988c531a8592a6a5959f0603a1ecf5e", size = 12252581, upload-time = "2025-10-31T00:26:10.297Z" }, + { url = "https://files.pythonhosted.org/packages/da/13/21096308f384d796ffe3f2960b17054110a9c3828d223ca540c2b7cc670b/ruff-0.14.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:afcdc4b5335ef440d19e7df9e8ae2ad9f749352190e96d481dc501b753f0733e", size = 12307503, upload-time = "2025-10-31T00:26:12.646Z" }, + { url = "https://files.pythonhosted.org/packages/cb/cc/a350bac23f03b7dbcde3c81b154706e80c6f16b06ff1ce28ed07dc7b07b0/ruff-0.14.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:7bfc42f81862749a7136267a343990f865e71fe2f99cf8d2958f684d23ce3dfa", size = 12675457, upload-time = "2025-10-31T00:26:15.044Z" }, + { url = "https://files.pythonhosted.org/packages/cb/76/46346029fa2f2078826bc88ef7167e8c198e58fe3126636e52f77488cbba/ruff-0.14.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:a65e448cfd7e9c59fae8cf37f9221585d3354febaad9a07f29158af1528e165f", size = 13403980, upload-time = "2025-10-31T00:26:17.81Z" }, + { url = "https://files.pythonhosted.org/packages/9f/a4/35f1ef68c4e7b236d4a5204e3669efdeefaef21f0ff6a456792b3d8be438/ruff-0.14.3-py3-none-win32.whl", hash = "sha256:f3d91857d023ba93e14ed2d462ab62c3428f9bbf2b4fbac50a03ca66d31991f7", size = 12500045, upload-time = "2025-10-31T00:26:20.503Z" }, + { url = "https://files.pythonhosted.org/packages/03/15/51960ae340823c9859fb60c63301d977308735403e2134e17d1d2858c7fb/ruff-0.14.3-py3-none-win_amd64.whl", hash = "sha256:d7b7006ac0756306db212fd37116cce2bd307e1e109375e1c6c106002df0ae5f", size = 13594005, upload-time = "2025-10-31T00:26:22.533Z" }, + { url = "https://files.pythonhosted.org/packages/b7/73/4de6579bac8e979fca0a77e54dec1f1e011a0d268165eb8a9bc0982a6564/ruff-0.14.3-py3-none-win_arm64.whl", hash = "sha256:26eb477ede6d399d898791d01961e16b86f02bc2486d0d1a7a9bb2379d055dc1", size = 12590017, upload-time = "2025-10-31T00:26:24.52Z" }, +] + +[[package]] +name = "safetensors" +version = "0.6.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ac/cc/738f3011628920e027a11754d9cae9abec1aed00f7ae860abbf843755233/safetensors-0.6.2.tar.gz", hash = "sha256:43ff2aa0e6fa2dc3ea5524ac7ad93a9839256b8703761e76e2d0b2a3fa4f15d9", size = 197968, upload-time = "2025-08-08T13:13:58.654Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/b1/3f5fd73c039fc87dba3ff8b5d528bfc5a32b597fea8e7a6a4800343a17c7/safetensors-0.6.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9c85ede8ec58f120bad982ec47746981e210492a6db876882aa021446af8ffba", size = 454797, upload-time = "2025-08-08T13:13:52.066Z" }, + { url = "https://files.pythonhosted.org/packages/8c/c9/bb114c158540ee17907ec470d01980957fdaf87b4aa07914c24eba87b9c6/safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d6675cf4b39c98dbd7d940598028f3742e0375a6b4d4277e76beb0c35f4b843b", size = 432206, upload-time = "2025-08-08T13:13:50.931Z" }, + { url = 
"https://files.pythonhosted.org/packages/d3/8e/f70c34e47df3110e8e0bb268d90db8d4be8958a54ab0336c9be4fe86dac8/safetensors-0.6.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d2d2b3ce1e2509c68932ca03ab8f20570920cd9754b05063d4368ee52833ecd", size = 473261, upload-time = "2025-08-08T13:13:41.259Z" }, + { url = "https://files.pythonhosted.org/packages/2a/f5/be9c6a7c7ef773e1996dc214e73485286df1836dbd063e8085ee1976f9cb/safetensors-0.6.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:93de35a18f46b0f5a6a1f9e26d91b442094f2df02e9fd7acf224cfec4238821a", size = 485117, upload-time = "2025-08-08T13:13:43.506Z" }, + { url = "https://files.pythonhosted.org/packages/c9/55/23f2d0a2c96ed8665bf17a30ab4ce5270413f4d74b6d87dd663258b9af31/safetensors-0.6.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89a89b505f335640f9120fac65ddeb83e40f1fd081cb8ed88b505bdccec8d0a1", size = 616154, upload-time = "2025-08-08T13:13:45.096Z" }, + { url = "https://files.pythonhosted.org/packages/98/c6/affb0bd9ce02aa46e7acddbe087912a04d953d7a4d74b708c91b5806ef3f/safetensors-0.6.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc4d0d0b937e04bdf2ae6f70cd3ad51328635fe0e6214aa1fc811f3b576b3bda", size = 520713, upload-time = "2025-08-08T13:13:46.25Z" }, + { url = "https://files.pythonhosted.org/packages/fe/5d/5a514d7b88e310c8b146e2404e0dc161282e78634d9358975fd56dfd14be/safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8045db2c872db8f4cbe3faa0495932d89c38c899c603f21e9b6486951a5ecb8f", size = 485835, upload-time = "2025-08-08T13:13:49.373Z" }, + { url = "https://files.pythonhosted.org/packages/7a/7b/4fc3b2ba62c352b2071bea9cfbad330fadda70579f617506ae1a2f129cab/safetensors-0.6.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:81e67e8bab9878bb568cffbc5f5e655adb38d2418351dc0859ccac158f753e19", size = 521503, upload-time = "2025-08-08T13:13:47.651Z" }, + { url = "https://files.pythonhosted.org/packages/5a/50/0057e11fe1f3cead9254315a6c106a16dd4b1a19cd247f7cc6414f6b7866/safetensors-0.6.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b0e4d029ab0a0e0e4fdf142b194514695b1d7d3735503ba700cf36d0fc7136ce", size = 652256, upload-time = "2025-08-08T13:13:53.167Z" }, + { url = "https://files.pythonhosted.org/packages/e9/29/473f789e4ac242593ac1656fbece6e1ecd860bb289e635e963667807afe3/safetensors-0.6.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:fa48268185c52bfe8771e46325a1e21d317207bcabcb72e65c6e28e9ffeb29c7", size = 747281, upload-time = "2025-08-08T13:13:54.656Z" }, + { url = "https://files.pythonhosted.org/packages/68/52/f7324aad7f2df99e05525c84d352dc217e0fa637a4f603e9f2eedfbe2c67/safetensors-0.6.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:d83c20c12c2d2f465997c51b7ecb00e407e5f94d7dec3ea0cc11d86f60d3fde5", size = 692286, upload-time = "2025-08-08T13:13:55.884Z" }, + { url = "https://files.pythonhosted.org/packages/ad/fe/cad1d9762868c7c5dc70c8620074df28ebb1a8e4c17d4c0cb031889c457e/safetensors-0.6.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d944cea65fad0ead848b6ec2c37cc0b197194bec228f8020054742190e9312ac", size = 655957, upload-time = "2025-08-08T13:13:57.029Z" }, + { url = "https://files.pythonhosted.org/packages/59/a7/e2158e17bbe57d104f0abbd95dff60dda916cf277c9f9663b4bf9bad8b6e/safetensors-0.6.2-cp38-abi3-win32.whl", hash = "sha256:cab75ca7c064d3911411461151cb69380c9225798a20e712b102edda2542ddb1", size = 308926, upload-time = "2025-08-08T13:14:01.095Z" }, + 
{ url = "https://files.pythonhosted.org/packages/2c/c3/c0be1135726618dc1e28d181b8c442403d8dbb9e273fd791de2d4384bcdd/safetensors-0.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:c7b214870df923cbc1593c3faee16bec59ea462758699bd3fee399d00aac072c", size = 320192, upload-time = "2025-08-08T13:13:59.467Z" }, +] + [[package]] name = "scikit-learn" version = "1.6.1" @@ -1510,6 +1930,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + [[package]] name = "snowballstemmer" version = "3.0.1" @@ -1676,6 +2105,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072, upload-time = "2024-07-29T01:10:08.203Z" }, ] +[[package]] +name = "stack-data" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asttokens" }, + { name = "executing" }, + { name = "pure-eval" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, +] + [[package]] name = "sympy" version = "1.14.0" @@ -1697,6 +2140,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, ] +[[package]] +name = "tokenizers" +version = "0.22.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1c/46/fb6854cec3278fbfa4a75b50232c77622bc517ac886156e6afbfa4d8fc6e/tokenizers-0.22.1.tar.gz", hash = "sha256:61de6522785310a309b3407bac22d99c4db5dba349935e99e4d15ea2226af2d9", size = 363123, upload-time = "2025-09-19T09:49:23.424Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/bf/33/f4b2d94ada7ab297328fc671fed209368ddb82f965ec2224eb1892674c3a/tokenizers-0.22.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:59fdb013df17455e5f950b4b834a7b3ee2e0271e6378ccb33aa74d178b513c73", size = 3069318, upload-time = "2025-09-19T09:49:11.848Z" }, + { url = "https://files.pythonhosted.org/packages/1c/58/2aa8c874d02b974990e89ff95826a4852a8b2a273c7d1b4411cdd45a4565/tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8d4e484f7b0827021ac5f9f71d4794aaef62b979ab7608593da22b1d2e3c4edc", size = 2926478, upload-time = "2025-09-19T09:49:09.759Z" }, + { url = "https://files.pythonhosted.org/packages/1e/3b/55e64befa1e7bfea963cf4b787b2cea1011362c4193f5477047532ce127e/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19d2962dd28bc67c1f205ab180578a78eef89ac60ca7ef7cbe9635a46a56422a", size = 3256994, upload-time = "2025-09-19T09:48:56.701Z" }, + { url = "https://files.pythonhosted.org/packages/71/0b/fbfecf42f67d9b7b80fde4aabb2b3110a97fac6585c9470b5bff103a80cb/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38201f15cdb1f8a6843e6563e6e79f4abd053394992b9bbdf5213ea3469b4ae7", size = 3153141, upload-time = "2025-09-19T09:48:59.749Z" }, + { url = "https://files.pythonhosted.org/packages/17/a9/b38f4e74e0817af8f8ef925507c63c6ae8171e3c4cb2d5d4624bf58fca69/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1cbe5454c9a15df1b3443c726063d930c16f047a3cc724b9e6e1a91140e5a21", size = 3508049, upload-time = "2025-09-19T09:49:05.868Z" }, + { url = "https://files.pythonhosted.org/packages/d2/48/dd2b3dac46bb9134a88e35d72e1aa4869579eacc1a27238f1577270773ff/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7d094ae6312d69cc2a872b54b91b309f4f6fbce871ef28eb27b52a98e4d0214", size = 3710730, upload-time = "2025-09-19T09:49:01.832Z" }, + { url = "https://files.pythonhosted.org/packages/93/0e/ccabc8d16ae4ba84a55d41345207c1e2ea88784651a5a487547d80851398/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afd7594a56656ace95cdd6df4cca2e4059d294c5cfb1679c57824b605556cb2f", size = 3412560, upload-time = "2025-09-19T09:49:03.867Z" }, + { url = "https://files.pythonhosted.org/packages/d0/c6/dc3a0db5a6766416c32c034286d7c2d406da1f498e4de04ab1b8959edd00/tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2ef6063d7a84994129732b47e7915e8710f27f99f3a3260b8a38fc7ccd083f4", size = 3250221, upload-time = "2025-09-19T09:49:07.664Z" }, + { url = "https://files.pythonhosted.org/packages/d7/a6/2c8486eef79671601ff57b093889a345dd3d576713ef047776015dc66de7/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ba0a64f450b9ef412c98f6bcd2a50c6df6e2443b560024a09fa6a03189726879", size = 9345569, upload-time = "2025-09-19T09:49:14.214Z" }, + { url = "https://files.pythonhosted.org/packages/6b/16/32ce667f14c35537f5f605fe9bea3e415ea1b0a646389d2295ec348d5657/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:331d6d149fa9c7d632cde4490fb8bbb12337fa3a0232e77892be656464f4b446", size = 9271599, upload-time = "2025-09-19T09:49:16.639Z" }, + { url = "https://files.pythonhosted.org/packages/51/7c/a5f7898a3f6baa3fc2685c705e04c98c1094c523051c805cdd9306b8f87e/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:607989f2ea68a46cb1dfbaf3e3aabdf3f21d8748312dbeb6263d1b3b66c5010a", size = 9533862, upload-time = 
"2025-09-19T09:49:19.146Z" }, + { url = "https://files.pythonhosted.org/packages/36/65/7e75caea90bc73c1dd8d40438adf1a7bc26af3b8d0a6705ea190462506e1/tokenizers-0.22.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a0f307d490295717726598ef6fa4f24af9d484809223bbc253b201c740a06390", size = 9681250, upload-time = "2025-09-19T09:49:21.501Z" }, + { url = "https://files.pythonhosted.org/packages/30/2c/959dddef581b46e6209da82df3b78471e96260e2bc463f89d23b1bf0e52a/tokenizers-0.22.1-cp39-abi3-win32.whl", hash = "sha256:b5120eed1442765cd90b903bb6cfef781fd8fe64e34ccaecbae4c619b7b12a82", size = 2472003, upload-time = "2025-09-19T09:49:27.089Z" }, + { url = "https://files.pythonhosted.org/packages/b3/46/e33a8c93907b631a99377ef4c5f817ab453d0b34f93529421f42ff559671/tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138", size = 2674684, upload-time = "2025-09-19T09:49:24.953Z" }, +] + [[package]] name = "torch" version = "2.7.0" @@ -1774,6 +2242,11 @@ explainability = [ { name = "nltk" }, { name = "unidecode" }, ] +huggingface = [ + { name = "datasets" }, + { name = "tokenizers" }, + { name = "transformers" }, +] preprocess = [ { name = "nltk" }, { name = "unidecode" }, @@ -1782,10 +2255,13 @@ preprocess = [ [package.dev-dependencies] dev = [ { name = "captum" }, + { name = "ipywidgets" }, { name = "nltk" }, { name = "pandas" }, + { name = "pre-commit" }, { name = "pyarrow" }, { name = "pytest" }, + { name = "ruff" }, { name = "scikit-learn" }, { name = "unidecode" }, ] @@ -1802,22 +2278,28 @@ docs = [ [package.metadata] requires-dist = [ { name = "captum", marker = "extra == 'explainability'" }, + { name = "datasets", marker = "extra == 'huggingface'", specifier = ">=4.3.0" }, { name = "nltk", marker = "extra == 'explainability'" }, { name = "nltk", marker = "extra == 'preprocess'" }, { name = "numpy", specifier = ">=1.26.4" }, { name = "pytorch-lightning", specifier = ">=2.4.0" }, + { name = "tokenizers", marker = "extra == 'huggingface'", specifier = ">=0.22.1" }, + { name = "transformers", marker = "extra == 'huggingface'", specifier = ">=4.57.1" }, { name = "unidecode", marker = "extra == 'explainability'" }, { name = "unidecode", marker = "extra == 'preprocess'" }, ] -provides-extras = ["explainability", "preprocess"] +provides-extras = ["explainability", "preprocess", "huggingface"] [package.metadata.requires-dev] dev = [ { name = "captum" }, + { name = "ipywidgets", specifier = ">=8.1.8" }, { name = "nltk" }, { name = "pandas" }, + { name = "pre-commit", specifier = ">=4.3.0" }, { name = "pyarrow" }, { name = "pytest", specifier = ">=8.1.1,<9" }, + { name = "ruff", specifier = ">=0.14.3" }, { name = "scikit-learn" }, { name = "unidecode" }, ] @@ -1843,6 +2325,36 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, ] +[[package]] +name = "traitlets" +version = "5.14.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, +] + +[[package]] +name = "transformers" +version = "4.57.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/68/a39307bcc4116a30b2106f2e689130a48de8bd8a1e635b5e1030e46fcd9e/transformers-4.57.1.tar.gz", hash = "sha256:f06c837959196c75039809636cd964b959f6604b75b8eeec6fdfc0440b89cc55", size = 10142511, upload-time = "2025-10-14T15:39:26.18Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = "sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925, upload-time = "2025-10-14T15:39:23.085Z" }, +] + [[package]] name = "triton" version = "3.3.0" @@ -1893,6 +2405,141 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, ] +[[package]] +name = "virtualenv" +version = "20.35.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "distlib" }, + { name = "filelock" }, + { name = "platformdirs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a4/d5/b0ccd381d55c8f45d46f77df6ae59fbc23d19e901e2d523395598e5f4c93/virtualenv-20.35.3.tar.gz", hash = "sha256:4f1a845d131133bdff10590489610c98c168ff99dc75d6c96853801f7f67af44", size = 6002907, upload-time = "2025-10-10T21:23:33.178Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/73/d9a94da0e9d470a543c1b9d3ccbceb0f59455983088e727b8a1824ed90fb/virtualenv-20.35.3-py3-none-any.whl", hash = "sha256:63d106565078d8c8d0b206d48080f938a8b25361e19432d2c9db40d2899c810a", size = 5981061, upload-time = "2025-10-10T21:23:30.433Z" }, +] + +[[package]] +name = "wcwidth" +version = "0.2.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/30/6b0809f4510673dc723187aeaf24c7f5459922d01e2f794277a3dfb90345/wcwidth-0.2.14.tar.gz", hash = "sha256:4d478375d31bc5395a3c55c40ccdf3354688364cd61c4f6adacaa9215d0b3605", size = 102293, upload-time = "2025-09-22T16:29:53.023Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/b5/123f13c975e9f27ab9c0770f514345bd406d0e8d3b7a0723af9d43f710af/wcwidth-0.2.14-py2.py3-none-any.whl", hash = "sha256:a7bb560c8aee30f9957e5f9895805edd20602f2d7f720186dfd906e82b4982e1", size = 37286, upload-time = "2025-09-22T16:29:51.641Z" }, +] + +[[package]] +name = "widgetsnbextension" +version = "4.0.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/f4/c67440c7fb409a71b7404b7aefcd7569a9c0d6bd071299bf4198ae7a5d95/widgetsnbextension-4.0.15.tar.gz", hash = "sha256:de8610639996f1567952d763a5a41af8af37f2575a41f9852a38f947eb82a3b9", size = 1097402, upload-time = 
"2025-11-01T21:15:55.178Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/0e/fa3b193432cfc60c93b42f3be03365f5f909d2b3ea410295cf36df739e31/widgetsnbextension-4.0.15-py3-none-any.whl", hash = "sha256:8156704e4346a571d9ce73b84bee86a29906c9abfd7223b7228a28899ccf3366", size = 2196503, upload-time = "2025-11-01T21:15:53.565Z" }, +] + +[[package]] +name = "xxhash" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/02/84/30869e01909fb37a6cc7e18688ee8bf1e42d57e7e0777636bd47524c43c7/xxhash-3.6.0.tar.gz", hash = "sha256:f0162a78b13a0d7617b2845b90c763339d1f1d82bb04a4b07f4ab535cc5e05d6", size = 85160, upload-time = "2025-10-02T14:37:08.097Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/d4/cc2f0400e9154df4b9964249da78ebd72f318e35ccc425e9f403c392f22a/xxhash-3.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b47bbd8cf2d72797f3c2772eaaac0ded3d3af26481a26d7d7d41dc2d3c46b04a", size = 32844, upload-time = "2025-10-02T14:34:14.037Z" }, + { url = "https://files.pythonhosted.org/packages/5e/ec/1cc11cd13e26ea8bc3cb4af4eaadd8d46d5014aebb67be3f71fb0b68802a/xxhash-3.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2b6821e94346f96db75abaa6e255706fb06ebd530899ed76d32cd99f20dc52fa", size = 30809, upload-time = "2025-10-02T14:34:15.484Z" }, + { url = "https://files.pythonhosted.org/packages/04/5f/19fe357ea348d98ca22f456f75a30ac0916b51c753e1f8b2e0e6fb884cce/xxhash-3.6.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d0a9751f71a1a65ce3584e9cae4467651c7e70c9d31017fa57574583a4540248", size = 194665, upload-time = "2025-10-02T14:34:16.541Z" }, + { url = "https://files.pythonhosted.org/packages/90/3b/d1f1a8f5442a5fd8beedae110c5af7604dc37349a8e16519c13c19a9a2de/xxhash-3.6.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b29ee68625ab37b04c0b40c3fafdf24d2f75ccd778333cfb698f65f6c463f62", size = 213550, upload-time = "2025-10-02T14:34:17.878Z" }, + { url = "https://files.pythonhosted.org/packages/c4/ef/3a9b05eb527457d5db13a135a2ae1a26c80fecd624d20f3e8dcc4cb170f3/xxhash-3.6.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6812c25fe0d6c36a46ccb002f40f27ac903bf18af9f6dd8f9669cb4d176ab18f", size = 212384, upload-time = "2025-10-02T14:34:19.182Z" }, + { url = "https://files.pythonhosted.org/packages/0f/18/ccc194ee698c6c623acbf0f8c2969811a8a4b6185af5e824cd27b9e4fd3e/xxhash-3.6.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4ccbff013972390b51a18ef1255ef5ac125c92dc9143b2d1909f59abc765540e", size = 445749, upload-time = "2025-10-02T14:34:20.659Z" }, + { url = "https://files.pythonhosted.org/packages/a5/86/cf2c0321dc3940a7aa73076f4fd677a0fb3e405cb297ead7d864fd90847e/xxhash-3.6.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:297b7fbf86c82c550e12e8fb71968b3f033d27b874276ba3624ea868c11165a8", size = 193880, upload-time = "2025-10-02T14:34:22.431Z" }, + { url = "https://files.pythonhosted.org/packages/82/fb/96213c8560e6f948a1ecc9a7613f8032b19ee45f747f4fca4eb31bb6d6ed/xxhash-3.6.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dea26ae1eb293db089798d3973a5fc928a18fdd97cc8801226fae705b02b14b0", size = 210912, upload-time = "2025-10-02T14:34:23.937Z" }, + { url = 
"https://files.pythonhosted.org/packages/40/aa/4395e669b0606a096d6788f40dbdf2b819d6773aa290c19e6e83cbfc312f/xxhash-3.6.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7a0b169aafb98f4284f73635a8e93f0735f9cbde17bd5ec332480484241aaa77", size = 198654, upload-time = "2025-10-02T14:34:25.644Z" }, + { url = "https://files.pythonhosted.org/packages/67/74/b044fcd6b3d89e9b1b665924d85d3f400636c23590226feb1eb09e1176ce/xxhash-3.6.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:08d45aef063a4531b785cd72de4887766d01dc8f362a515693df349fdb825e0c", size = 210867, upload-time = "2025-10-02T14:34:27.203Z" }, + { url = "https://files.pythonhosted.org/packages/bc/fd/3ce73bf753b08cb19daee1eb14aa0d7fe331f8da9c02dd95316ddfe5275e/xxhash-3.6.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:929142361a48ee07f09121fe9e96a84950e8d4df3bb298ca5d88061969f34d7b", size = 414012, upload-time = "2025-10-02T14:34:28.409Z" }, + { url = "https://files.pythonhosted.org/packages/ba/b3/5a4241309217c5c876f156b10778f3ab3af7ba7e3259e6d5f5c7d0129eb2/xxhash-3.6.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:51312c768403d8540487dbbfb557454cfc55589bbde6424456951f7fcd4facb3", size = 191409, upload-time = "2025-10-02T14:34:29.696Z" }, + { url = "https://files.pythonhosted.org/packages/c0/01/99bfbc15fb9abb9a72b088c1d95219fc4782b7d01fc835bd5744d66dd0b8/xxhash-3.6.0-cp311-cp311-win32.whl", hash = "sha256:d1927a69feddc24c987b337ce81ac15c4720955b667fe9b588e02254b80446fd", size = 30574, upload-time = "2025-10-02T14:34:31.028Z" }, + { url = "https://files.pythonhosted.org/packages/65/79/9d24d7f53819fe301b231044ea362ce64e86c74f6e8c8e51320de248b3e5/xxhash-3.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:26734cdc2d4ffe449b41d186bbeac416f704a482ed835d375a5c0cb02bc63fef", size = 31481, upload-time = "2025-10-02T14:34:32.062Z" }, + { url = "https://files.pythonhosted.org/packages/30/4e/15cd0e3e8772071344eab2961ce83f6e485111fed8beb491a3f1ce100270/xxhash-3.6.0-cp311-cp311-win_arm64.whl", hash = "sha256:d72f67ef8bf36e05f5b6c65e8524f265bd61071471cd4cf1d36743ebeeeb06b7", size = 27861, upload-time = "2025-10-02T14:34:33.555Z" }, + { url = "https://files.pythonhosted.org/packages/9a/07/d9412f3d7d462347e4511181dea65e47e0d0e16e26fbee2ea86a2aefb657/xxhash-3.6.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:01362c4331775398e7bb34e3ab403bc9ee9f7c497bc7dee6272114055277dd3c", size = 32744, upload-time = "2025-10-02T14:34:34.622Z" }, + { url = "https://files.pythonhosted.org/packages/79/35/0429ee11d035fc33abe32dca1b2b69e8c18d236547b9a9b72c1929189b9a/xxhash-3.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b7b2df81a23f8cb99656378e72501b2cb41b1827c0f5a86f87d6b06b69f9f204", size = 30816, upload-time = "2025-10-02T14:34:36.043Z" }, + { url = "https://files.pythonhosted.org/packages/b7/f2/57eb99aa0f7d98624c0932c5b9a170e1806406cdbcdb510546634a1359e0/xxhash-3.6.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:dc94790144e66b14f67b10ac8ed75b39ca47536bf8800eb7c24b50271ea0c490", size = 194035, upload-time = "2025-10-02T14:34:37.354Z" }, + { url = "https://files.pythonhosted.org/packages/4c/ed/6224ba353690d73af7a3f1c7cdb1fc1b002e38f783cb991ae338e1eb3d79/xxhash-3.6.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:93f107c673bccf0d592cdba077dedaf52fe7f42dcd7676eba1f6d6f0c3efffd2", size = 212914, upload-time = "2025-10-02T14:34:38.6Z" }, + { url = 
"https://files.pythonhosted.org/packages/38/86/fb6b6130d8dd6b8942cc17ab4d90e223653a89aa32ad2776f8af7064ed13/xxhash-3.6.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2aa5ee3444c25b69813663c9f8067dcfaa2e126dc55e8dddf40f4d1c25d7effa", size = 212163, upload-time = "2025-10-02T14:34:39.872Z" }, + { url = "https://files.pythonhosted.org/packages/ee/dc/e84875682b0593e884ad73b2d40767b5790d417bde603cceb6878901d647/xxhash-3.6.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f7f99123f0e1194fa59cc69ad46dbae2e07becec5df50a0509a808f90a0f03f0", size = 445411, upload-time = "2025-10-02T14:34:41.569Z" }, + { url = "https://files.pythonhosted.org/packages/11/4f/426f91b96701ec2f37bb2b8cec664eff4f658a11f3fa9d94f0a887ea6d2b/xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:49e03e6fe2cac4a1bc64952dd250cf0dbc5ef4ebb7b8d96bce82e2de163c82a2", size = 193883, upload-time = "2025-10-02T14:34:43.249Z" }, + { url = "https://files.pythonhosted.org/packages/53/5a/ddbb83eee8e28b778eacfc5a85c969673e4023cdeedcfcef61f36731610b/xxhash-3.6.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bd17fede52a17a4f9a7bc4472a5867cb0b160deeb431795c0e4abe158bc784e9", size = 210392, upload-time = "2025-10-02T14:34:45.042Z" }, + { url = "https://files.pythonhosted.org/packages/1e/c2/ff69efd07c8c074ccdf0a4f36fcdd3d27363665bcdf4ba399abebe643465/xxhash-3.6.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6fb5f5476bef678f69db04f2bd1efbed3030d2aba305b0fc1773645f187d6a4e", size = 197898, upload-time = "2025-10-02T14:34:46.302Z" }, + { url = "https://files.pythonhosted.org/packages/58/ca/faa05ac19b3b622c7c9317ac3e23954187516298a091eb02c976d0d3dd45/xxhash-3.6.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:843b52f6d88071f87eba1631b684fcb4b2068cd2180a0224122fe4ef011a9374", size = 210655, upload-time = "2025-10-02T14:34:47.571Z" }, + { url = "https://files.pythonhosted.org/packages/d4/7a/06aa7482345480cc0cb597f5c875b11a82c3953f534394f620b0be2f700c/xxhash-3.6.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7d14a6cfaf03b1b6f5f9790f76880601ccc7896aff7ab9cd8978a939c1eb7e0d", size = 414001, upload-time = "2025-10-02T14:34:49.273Z" }, + { url = "https://files.pythonhosted.org/packages/23/07/63ffb386cd47029aa2916b3d2f454e6cc5b9f5c5ada3790377d5430084e7/xxhash-3.6.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:418daf3db71e1413cfe211c2f9a528456936645c17f46b5204705581a45390ae", size = 191431, upload-time = "2025-10-02T14:34:50.798Z" }, + { url = "https://files.pythonhosted.org/packages/0f/93/14fde614cadb4ddf5e7cebf8918b7e8fac5ae7861c1875964f17e678205c/xxhash-3.6.0-cp312-cp312-win32.whl", hash = "sha256:50fc255f39428a27299c20e280d6193d8b63b8ef8028995323bf834a026b4fbb", size = 30617, upload-time = "2025-10-02T14:34:51.954Z" }, + { url = "https://files.pythonhosted.org/packages/13/5d/0d125536cbe7565a83d06e43783389ecae0c0f2ed037b48ede185de477c0/xxhash-3.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:c0f2ab8c715630565ab8991b536ecded9416d615538be8ecddce43ccf26cbc7c", size = 31534, upload-time = "2025-10-02T14:34:53.276Z" }, + { url = "https://files.pythonhosted.org/packages/54/85/6ec269b0952ec7e36ba019125982cf11d91256a778c7c3f98a4c5043d283/xxhash-3.6.0-cp312-cp312-win_arm64.whl", hash = "sha256:eae5c13f3bc455a3bbb68bdc513912dc7356de7e2280363ea235f71f54064829", size = 27876, upload-time = "2025-10-02T14:34:54.371Z" }, + { url = 
"https://files.pythonhosted.org/packages/33/76/35d05267ac82f53ae9b0e554da7c5e281ee61f3cad44c743f0fcd354f211/xxhash-3.6.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:599e64ba7f67472481ceb6ee80fa3bd828fd61ba59fb11475572cc5ee52b89ec", size = 32738, upload-time = "2025-10-02T14:34:55.839Z" }, + { url = "https://files.pythonhosted.org/packages/31/a8/3fbce1cd96534a95e35d5120637bf29b0d7f5d8fa2f6374e31b4156dd419/xxhash-3.6.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7d8b8aaa30fca4f16f0c84a5c8d7ddee0e25250ec2796c973775373257dde8f1", size = 30821, upload-time = "2025-10-02T14:34:57.219Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ea/d387530ca7ecfa183cb358027f1833297c6ac6098223fd14f9782cd0015c/xxhash-3.6.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d597acf8506d6e7101a4a44a5e428977a51c0fadbbfd3c39650cca9253f6e5a6", size = 194127, upload-time = "2025-10-02T14:34:59.21Z" }, + { url = "https://files.pythonhosted.org/packages/ba/0c/71435dcb99874b09a43b8d7c54071e600a7481e42b3e3ce1eb5226a5711a/xxhash-3.6.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:858dc935963a33bc33490128edc1c12b0c14d9c7ebaa4e387a7869ecc4f3e263", size = 212975, upload-time = "2025-10-02T14:35:00.816Z" }, + { url = "https://files.pythonhosted.org/packages/84/7a/c2b3d071e4bb4a90b7057228a99b10d51744878f4a8a6dd643c8bd897620/xxhash-3.6.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ba284920194615cb8edf73bf52236ce2e1664ccd4a38fdb543506413529cc546", size = 212241, upload-time = "2025-10-02T14:35:02.207Z" }, + { url = "https://files.pythonhosted.org/packages/81/5f/640b6eac0128e215f177df99eadcd0f1b7c42c274ab6a394a05059694c5a/xxhash-3.6.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4b54219177f6c6674d5378bd862c6aedf64725f70dd29c472eaae154df1a2e89", size = 445471, upload-time = "2025-10-02T14:35:03.61Z" }, + { url = "https://files.pythonhosted.org/packages/5e/1e/3c3d3ef071b051cc3abbe3721ffb8365033a172613c04af2da89d5548a87/xxhash-3.6.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:42c36dd7dbad2f5238950c377fcbf6811b1cdb1c444fab447960030cea60504d", size = 193936, upload-time = "2025-10-02T14:35:05.013Z" }, + { url = "https://files.pythonhosted.org/packages/2c/bd/4a5f68381939219abfe1c22a9e3a5854a4f6f6f3c4983a87d255f21f2e5d/xxhash-3.6.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f22927652cba98c44639ffdc7aaf35828dccf679b10b31c4ad72a5b530a18eb7", size = 210440, upload-time = "2025-10-02T14:35:06.239Z" }, + { url = "https://files.pythonhosted.org/packages/eb/37/b80fe3d5cfb9faff01a02121a0f4d565eb7237e9e5fc66e73017e74dcd36/xxhash-3.6.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b45fad44d9c5c119e9c6fbf2e1c656a46dc68e280275007bbfd3d572b21426db", size = 197990, upload-time = "2025-10-02T14:35:07.735Z" }, + { url = "https://files.pythonhosted.org/packages/d7/fd/2c0a00c97b9e18f72e1f240ad4e8f8a90fd9d408289ba9c7c495ed7dc05c/xxhash-3.6.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6f2580ffab1a8b68ef2b901cde7e55fa8da5e4be0977c68f78fc80f3c143de42", size = 210689, upload-time = "2025-10-02T14:35:09.438Z" }, + { url = "https://files.pythonhosted.org/packages/93/86/5dd8076a926b9a95db3206aba20d89a7fc14dd5aac16e5c4de4b56033140/xxhash-3.6.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = 
"sha256:40c391dd3cd041ebc3ffe6f2c862f402e306eb571422e0aa918d8070ba31da11", size = 414068, upload-time = "2025-10-02T14:35:11.162Z" }, + { url = "https://files.pythonhosted.org/packages/af/3c/0bb129170ee8f3650f08e993baee550a09593462a5cddd8e44d0011102b1/xxhash-3.6.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f205badabde7aafd1a31e8ca2a3e5a763107a71c397c4481d6a804eb5063d8bd", size = 191495, upload-time = "2025-10-02T14:35:12.971Z" }, + { url = "https://files.pythonhosted.org/packages/e9/3a/6797e0114c21d1725e2577508e24006fd7ff1d8c0c502d3b52e45c1771d8/xxhash-3.6.0-cp313-cp313-win32.whl", hash = "sha256:2577b276e060b73b73a53042ea5bd5203d3e6347ce0d09f98500f418a9fcf799", size = 30620, upload-time = "2025-10-02T14:35:14.129Z" }, + { url = "https://files.pythonhosted.org/packages/86/15/9bc32671e9a38b413a76d24722a2bf8784a132c043063a8f5152d390b0f9/xxhash-3.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:757320d45d2fbcce8f30c42a6b2f47862967aea7bf458b9625b4bbe7ee390392", size = 31542, upload-time = "2025-10-02T14:35:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/39/c5/cc01e4f6188656e56112d6a8e0dfe298a16934b8c47a247236549a3f7695/xxhash-3.6.0-cp313-cp313-win_arm64.whl", hash = "sha256:457b8f85dec5825eed7b69c11ae86834a018b8e3df5e77783c999663da2f96d6", size = 27880, upload-time = "2025-10-02T14:35:16.315Z" }, + { url = "https://files.pythonhosted.org/packages/f3/30/25e5321c8732759e930c555176d37e24ab84365482d257c3b16362235212/xxhash-3.6.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a42e633d75cdad6d625434e3468126c73f13f7584545a9cf34e883aa1710e702", size = 32956, upload-time = "2025-10-02T14:35:17.413Z" }, + { url = "https://files.pythonhosted.org/packages/9f/3c/0573299560d7d9f8ab1838f1efc021a280b5ae5ae2e849034ef3dee18810/xxhash-3.6.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:568a6d743219e717b07b4e03b0a828ce593833e498c3b64752e0f5df6bfe84db", size = 31072, upload-time = "2025-10-02T14:35:18.844Z" }, + { url = "https://files.pythonhosted.org/packages/7a/1c/52d83a06e417cd9d4137722693424885cc9878249beb3a7c829e74bf7ce9/xxhash-3.6.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:bec91b562d8012dae276af8025a55811b875baace6af510412a5e58e3121bc54", size = 196409, upload-time = "2025-10-02T14:35:20.31Z" }, + { url = "https://files.pythonhosted.org/packages/e3/8e/c6d158d12a79bbd0b878f8355432075fc82759e356ab5a111463422a239b/xxhash-3.6.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:78e7f2f4c521c30ad5e786fdd6bae89d47a32672a80195467b5de0480aa97b1f", size = 215736, upload-time = "2025-10-02T14:35:21.616Z" }, + { url = "https://files.pythonhosted.org/packages/bc/68/c4c80614716345d55071a396cf03d06e34b5f4917a467faf43083c995155/xxhash-3.6.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3ed0df1b11a79856df5ffcab572cbd6b9627034c1c748c5566fa79df9048a7c5", size = 214833, upload-time = "2025-10-02T14:35:23.32Z" }, + { url = "https://files.pythonhosted.org/packages/7e/e9/ae27c8ffec8b953efa84c7c4a6c6802c263d587b9fc0d6e7cea64e08c3af/xxhash-3.6.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0e4edbfc7d420925b0dd5e792478ed393d6e75ff8fc219a6546fb446b6a417b1", size = 448348, upload-time = "2025-10-02T14:35:25.111Z" }, + { url = 
"https://files.pythonhosted.org/packages/d7/6b/33e21afb1b5b3f46b74b6bd1913639066af218d704cc0941404ca717fc57/xxhash-3.6.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fba27a198363a7ef87f8c0f6b171ec36b674fe9053742c58dd7e3201c1ab30ee", size = 196070, upload-time = "2025-10-02T14:35:26.586Z" }, + { url = "https://files.pythonhosted.org/packages/96/b6/fcabd337bc5fa624e7203aa0fa7d0c49eed22f72e93229431752bddc83d9/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:794fe9145fe60191c6532fa95063765529770edcdd67b3d537793e8004cabbfd", size = 212907, upload-time = "2025-10-02T14:35:28.087Z" }, + { url = "https://files.pythonhosted.org/packages/4b/d3/9ee6160e644d660fcf176c5825e61411c7f62648728f69c79ba237250143/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:6105ef7e62b5ac73a837778efc331a591d8442f8ef5c7e102376506cb4ae2729", size = 200839, upload-time = "2025-10-02T14:35:29.857Z" }, + { url = "https://files.pythonhosted.org/packages/0d/98/e8de5baa5109394baf5118f5e72ab21a86387c4f89b0e77ef3e2f6b0327b/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:f01375c0e55395b814a679b3eea205db7919ac2af213f4a6682e01220e5fe292", size = 213304, upload-time = "2025-10-02T14:35:31.222Z" }, + { url = "https://files.pythonhosted.org/packages/7b/1d/71056535dec5c3177eeb53e38e3d367dd1d16e024e63b1cee208d572a033/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:d706dca2d24d834a4661619dcacf51a75c16d65985718d6a7d73c1eeeb903ddf", size = 416930, upload-time = "2025-10-02T14:35:32.517Z" }, + { url = "https://files.pythonhosted.org/packages/dc/6c/5cbde9de2cd967c322e651c65c543700b19e7ae3e0aae8ece3469bf9683d/xxhash-3.6.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5f059d9faeacd49c0215d66f4056e1326c80503f51a1532ca336a385edadd033", size = 193787, upload-time = "2025-10-02T14:35:33.827Z" }, + { url = "https://files.pythonhosted.org/packages/19/fa/0172e350361d61febcea941b0cc541d6e6c8d65d153e85f850a7b256ff8a/xxhash-3.6.0-cp313-cp313t-win32.whl", hash = "sha256:1244460adc3a9be84731d72b8e80625788e5815b68da3da8b83f78115a40a7ec", size = 30916, upload-time = "2025-10-02T14:35:35.107Z" }, + { url = "https://files.pythonhosted.org/packages/ad/e6/e8cf858a2b19d6d45820f072eff1bea413910592ff17157cabc5f1227a16/xxhash-3.6.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b1e420ef35c503869c4064f4a2f2b08ad6431ab7b229a05cce39d74268bca6b8", size = 31799, upload-time = "2025-10-02T14:35:36.165Z" }, + { url = "https://files.pythonhosted.org/packages/56/15/064b197e855bfb7b343210e82490ae672f8bc7cdf3ddb02e92f64304ee8a/xxhash-3.6.0-cp313-cp313t-win_arm64.whl", hash = "sha256:ec44b73a4220623235f67a996c862049f375df3b1052d9899f40a6382c32d746", size = 28044, upload-time = "2025-10-02T14:35:37.195Z" }, + { url = "https://files.pythonhosted.org/packages/7e/5e/0138bc4484ea9b897864d59fce9be9086030825bc778b76cb5a33a906d37/xxhash-3.6.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a40a3d35b204b7cc7643cbcf8c9976d818cb47befcfac8bbefec8038ac363f3e", size = 32754, upload-time = "2025-10-02T14:35:38.245Z" }, + { url = "https://files.pythonhosted.org/packages/18/d7/5dac2eb2ec75fd771957a13e5dda560efb2176d5203f39502a5fc571f899/xxhash-3.6.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a54844be970d3fc22630b32d515e79a90d0a3ddb2644d8d7402e3c4c8da61405", size = 30846, upload-time = "2025-10-02T14:35:39.6Z" }, + { url = 
"https://files.pythonhosted.org/packages/fe/71/8bc5be2bb00deb5682e92e8da955ebe5fa982da13a69da5a40a4c8db12fb/xxhash-3.6.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:016e9190af8f0a4e3741343777710e3d5717427f175adfdc3e72508f59e2a7f3", size = 194343, upload-time = "2025-10-02T14:35:40.69Z" }, + { url = "https://files.pythonhosted.org/packages/e7/3b/52badfb2aecec2c377ddf1ae75f55db3ba2d321c5e164f14461c90837ef3/xxhash-3.6.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4f6f72232f849eb9d0141e2ebe2677ece15adfd0fa599bc058aad83c714bb2c6", size = 213074, upload-time = "2025-10-02T14:35:42.29Z" }, + { url = "https://files.pythonhosted.org/packages/a2/2b/ae46b4e9b92e537fa30d03dbc19cdae57ed407e9c26d163895e968e3de85/xxhash-3.6.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:63275a8aba7865e44b1813d2177e0f5ea7eadad3dd063a21f7cf9afdc7054063", size = 212388, upload-time = "2025-10-02T14:35:43.929Z" }, + { url = "https://files.pythonhosted.org/packages/f5/80/49f88d3afc724b4ac7fbd664c8452d6db51b49915be48c6982659e0e7942/xxhash-3.6.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3cd01fa2aa00d8b017c97eb46b9a794fbdca53fc14f845f5a328c71254b0abb7", size = 445614, upload-time = "2025-10-02T14:35:45.216Z" }, + { url = "https://files.pythonhosted.org/packages/ed/ba/603ce3961e339413543d8cd44f21f2c80e2a7c5cfe692a7b1f2cccf58f3c/xxhash-3.6.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0226aa89035b62b6a86d3c68df4d7c1f47a342b8683da2b60cedcddb46c4d95b", size = 194024, upload-time = "2025-10-02T14:35:46.959Z" }, + { url = "https://files.pythonhosted.org/packages/78/d1/8e225ff7113bf81545cfdcd79eef124a7b7064a0bba53605ff39590b95c2/xxhash-3.6.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c6e193e9f56e4ca4923c61238cdaced324f0feac782544eb4c6d55ad5cc99ddd", size = 210541, upload-time = "2025-10-02T14:35:48.301Z" }, + { url = "https://files.pythonhosted.org/packages/6f/58/0f89d149f0bad89def1a8dd38feb50ccdeb643d9797ec84707091d4cb494/xxhash-3.6.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:9176dcaddf4ca963d4deb93866d739a343c01c969231dbe21680e13a5d1a5bf0", size = 198305, upload-time = "2025-10-02T14:35:49.584Z" }, + { url = "https://files.pythonhosted.org/packages/11/38/5eab81580703c4df93feb5f32ff8fa7fe1e2c51c1f183ee4e48d4bb9d3d7/xxhash-3.6.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c1ce4009c97a752e682b897aa99aef84191077a9433eb237774689f14f8ec152", size = 210848, upload-time = "2025-10-02T14:35:50.877Z" }, + { url = "https://files.pythonhosted.org/packages/5e/6b/953dc4b05c3ce678abca756416e4c130d2382f877a9c30a20d08ee6a77c0/xxhash-3.6.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:8cb2f4f679b01513b7adbb9b1b2f0f9cdc31b70007eaf9d59d0878809f385b11", size = 414142, upload-time = "2025-10-02T14:35:52.15Z" }, + { url = "https://files.pythonhosted.org/packages/08/a9/238ec0d4e81a10eb5026d4a6972677cbc898ba6c8b9dbaec12ae001b1b35/xxhash-3.6.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:653a91d7c2ab54a92c19ccf43508b6a555440b9be1bc8be553376778be7f20b5", size = 191547, upload-time = "2025-10-02T14:35:53.547Z" }, + { url = "https://files.pythonhosted.org/packages/f1/ee/3cf8589e06c2164ac77c3bf0aa127012801128f1feebf2a079272da5737c/xxhash-3.6.0-cp314-cp314-win32.whl", hash = "sha256:a756fe893389483ee8c394d06b5ab765d96e68fbbfe6fde7aa17e11f5720559f", size = 
31214, upload-time = "2025-10-02T14:35:54.746Z" }, + { url = "https://files.pythonhosted.org/packages/02/5d/a19552fbc6ad4cb54ff953c3908bbc095f4a921bc569433d791f755186f1/xxhash-3.6.0-cp314-cp314-win_amd64.whl", hash = "sha256:39be8e4e142550ef69629c9cd71b88c90e9a5db703fecbcf265546d9536ca4ad", size = 32290, upload-time = "2025-10-02T14:35:55.791Z" }, + { url = "https://files.pythonhosted.org/packages/b1/11/dafa0643bc30442c887b55baf8e73353a344ee89c1901b5a5c54a6c17d39/xxhash-3.6.0-cp314-cp314-win_arm64.whl", hash = "sha256:25915e6000338999236f1eb68a02a32c3275ac338628a7eaa5a269c401995679", size = 28795, upload-time = "2025-10-02T14:35:57.162Z" }, + { url = "https://files.pythonhosted.org/packages/2c/db/0e99732ed7f64182aef4a6fb145e1a295558deec2a746265dcdec12d191e/xxhash-3.6.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c5294f596a9017ca5a3e3f8884c00b91ab2ad2933cf288f4923c3fd4346cf3d4", size = 32955, upload-time = "2025-10-02T14:35:58.267Z" }, + { url = "https://files.pythonhosted.org/packages/55/f4/2a7c3c68e564a099becfa44bb3d398810cc0ff6749b0d3cb8ccb93f23c14/xxhash-3.6.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1cf9dcc4ab9cff01dfbba78544297a3a01dafd60f3bde4e2bfd016cf7e4ddc67", size = 31072, upload-time = "2025-10-02T14:35:59.382Z" }, + { url = "https://files.pythonhosted.org/packages/c6/d9/72a29cddc7250e8a5819dad5d466facb5dc4c802ce120645630149127e73/xxhash-3.6.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:01262da8798422d0685f7cef03b2bd3f4f46511b02830861df548d7def4402ad", size = 196579, upload-time = "2025-10-02T14:36:00.838Z" }, + { url = "https://files.pythonhosted.org/packages/63/93/b21590e1e381040e2ca305a884d89e1c345b347404f7780f07f2cdd47ef4/xxhash-3.6.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51a73fb7cb3a3ead9f7a8b583ffd9b8038e277cdb8cb87cf890e88b3456afa0b", size = 215854, upload-time = "2025-10-02T14:36:02.207Z" }, + { url = "https://files.pythonhosted.org/packages/ce/b8/edab8a7d4fa14e924b29be877d54155dcbd8b80be85ea00d2be3413a9ed4/xxhash-3.6.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b9c6df83594f7df8f7f708ce5ebeacfc69f72c9fbaaababf6cf4758eaada0c9b", size = 214965, upload-time = "2025-10-02T14:36:03.507Z" }, + { url = "https://files.pythonhosted.org/packages/27/67/dfa980ac7f0d509d54ea0d5a486d2bb4b80c3f1bb22b66e6a05d3efaf6c0/xxhash-3.6.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:627f0af069b0ea56f312fd5189001c24578868643203bca1abbc2c52d3a6f3ca", size = 448484, upload-time = "2025-10-02T14:36:04.828Z" }, + { url = "https://files.pythonhosted.org/packages/8c/63/8ffc2cc97e811c0ca5d00ab36604b3ea6f4254f20b7bc658ca825ce6c954/xxhash-3.6.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aa912c62f842dfd013c5f21a642c9c10cd9f4c4e943e0af83618b4a404d9091a", size = 196162, upload-time = "2025-10-02T14:36:06.182Z" }, + { url = "https://files.pythonhosted.org/packages/4b/77/07f0e7a3edd11a6097e990f6e5b815b6592459cb16dae990d967693e6ea9/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:b465afd7909db30168ab62afe40b2fcf79eedc0b89a6c0ab3123515dc0df8b99", size = 213007, upload-time = "2025-10-02T14:36:07.733Z" }, + { url = "https://files.pythonhosted.org/packages/ae/d8/bc5fa0d152837117eb0bef6f83f956c509332ce133c91c63ce07ee7c4873/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = 
"sha256:a881851cf38b0a70e7c4d3ce81fc7afd86fbc2a024f4cfb2a97cf49ce04b75d3", size = 200956, upload-time = "2025-10-02T14:36:09.106Z" }, + { url = "https://files.pythonhosted.org/packages/26/a5/d749334130de9411783873e9b98ecc46688dad5db64ca6e04b02acc8b473/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9b3222c686a919a0f3253cfc12bb118b8b103506612253b5baeaac10d8027cf6", size = 213401, upload-time = "2025-10-02T14:36:10.585Z" }, + { url = "https://files.pythonhosted.org/packages/89/72/abed959c956a4bfc72b58c0384bb7940663c678127538634d896b1195c10/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:c5aa639bc113e9286137cec8fadc20e9cd732b2cc385c0b7fa673b84fc1f2a93", size = 417083, upload-time = "2025-10-02T14:36:12.276Z" }, + { url = "https://files.pythonhosted.org/packages/0c/b3/62fd2b586283b7d7d665fb98e266decadf31f058f1cf6c478741f68af0cb/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5c1343d49ac102799905e115aee590183c3921d475356cb24b4de29a4bc56518", size = 193913, upload-time = "2025-10-02T14:36:14.025Z" }, + { url = "https://files.pythonhosted.org/packages/9a/9a/c19c42c5b3f5a4aad748a6d5b4f23df3bed7ee5445accc65a0fb3ff03953/xxhash-3.6.0-cp314-cp314t-win32.whl", hash = "sha256:5851f033c3030dd95c086b4a36a2683c2ff4a799b23af60977188b057e467119", size = 31586, upload-time = "2025-10-02T14:36:15.603Z" }, + { url = "https://files.pythonhosted.org/packages/03/d6/4cc450345be9924fd5dc8c590ceda1db5b43a0a889587b0ae81a95511360/xxhash-3.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0444e7967dac37569052d2409b00a8860c2135cff05502df4da80267d384849f", size = 32526, upload-time = "2025-10-02T14:36:16.708Z" }, + { url = "https://files.pythonhosted.org/packages/0f/c9/7243eb3f9eaabd1a88a5a5acadf06df2d83b100c62684b7425c6a11bcaa8/xxhash-3.6.0-cp314-cp314t-win_arm64.whl", hash = "sha256:bb79b1e63f6fd84ec778a4b1916dfe0a7c3fdb986c06addd5db3a0d413819d95", size = 28898, upload-time = "2025-10-02T14:36:17.843Z" }, + { url = "https://files.pythonhosted.org/packages/93/1e/8aec23647a34a249f62e2398c42955acd9b4c6ed5cf08cbea94dc46f78d2/xxhash-3.6.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0f7b7e2ec26c1666ad5fc9dbfa426a6a3367ceaf79db5dd76264659d509d73b0", size = 30662, upload-time = "2025-10-02T14:37:01.743Z" }, + { url = "https://files.pythonhosted.org/packages/b8/0b/b14510b38ba91caf43006209db846a696ceea6a847a0c9ba0a5b1adc53d6/xxhash-3.6.0-pp311-pypy311_pp73-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5dc1e14d14fa0f5789ec29a7062004b5933964bb9b02aae6622b8f530dc40296", size = 41056, upload-time = "2025-10-02T14:37:02.879Z" }, + { url = "https://files.pythonhosted.org/packages/50/55/15a7b8a56590e66ccd374bbfa3f9ffc45b810886c8c3b614e3f90bd2367c/xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:881b47fc47e051b37d94d13e7455131054b56749b91b508b0907eb07900d1c13", size = 36251, upload-time = "2025-10-02T14:37:04.44Z" }, + { url = "https://files.pythonhosted.org/packages/62/b2/5ac99a041a29e58e95f907876b04f7067a0242cb85b5f39e726153981503/xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c6dc31591899f5e5666f04cc2e529e69b4072827085c1ef15294d91a004bc1bd", size = 32481, upload-time = "2025-10-02T14:37:05.869Z" }, + { url = "https://files.pythonhosted.org/packages/7b/d9/8d95e906764a386a3d3b596f3c68bb63687dfca806373509f51ce8eea81f/xxhash-3.6.0-pp311-pypy311_pp73-win_amd64.whl", hash = 
"sha256:15e0dac10eb9309508bfc41f7f9deaa7755c69e35af835db9cb10751adebc35d", size = 31565, upload-time = "2025-10-02T14:37:06.966Z" }, +] + [[package]] name = "yarl" version = "1.20.0"