From 61568ec768d1624306c948d6bde1c79021fa0c0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc?= <loic.outhier.work@gmail.com>
Date: Thu, 12 Mar 2026 10:18:36 +0100
Subject: [PATCH 1/2] Add CSV identifier validation with comprehensive error
 reporting

- Validates 51 CSV files with identifier columns
- Reports missing files and invalid identifiers together
- Found 16 invalid identifiers across items, locations, and move_meta_categories
---
 pokemon_v2/test_models.py | 235 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 235 insertions(+)

diff --git a/pokemon_v2/test_models.py b/pokemon_v2/test_models.py
index 84f2dbf72..891799da8 100644
--- a/pokemon_v2/test_models.py
+++ b/pokemon_v2/test_models.py
@@ -1,3 +1,6 @@
+import csv
+import os
+import re
 from django.test import TestCase
 from pokemon_v2.models import *
 
@@ -9,3 +12,235 @@ def setUp(self):
     def fields_are_valid(self):
         smell = Ability.objects.get(name="Smell")
         self.assertEqual(smell.generation_id, 3)
+
+
+class CSVResourceNameValidationTestCase(TestCase):
+    """
+    Test that all resource identifiers in CSV files follow ASCII slug format.
+
+    Resource identifiers are used in API URLs and should be URL-safe ASCII slugs
+    (lowercase letters, numbers, and hyphens only).
+
+    This test validates the data source (CSV files) before it's loaded into the database.
+    """
+
+    # Pattern for valid resource identifiers: lowercase letters, numbers, and hyphens only
+    VALID_IDENTIFIER_PATTERN = re.compile(r"^[a-z0-9-]+$")
+
+    # CSV files that contain an 'identifier' column to validate
+    # Format: (filename, identifier_column_name)
+    CSV_FILES_TO_VALIDATE = [
+        ("abilities.csv", "identifier"),
+        ("berry_firmness.csv", "identifier"),
+        ("conquest_episodes.csv", "identifier"),
+        ("conquest_kingdoms.csv", "identifier"),
+        ("conquest_move_displacements.csv", "identifier"),
+        ("conquest_move_ranges.csv", "identifier"),
+        ("conquest_stats.csv", "identifier"),
+        ("conquest_warrior_archetypes.csv", "identifier"),
+        ("conquest_warrior_skills.csv", "identifier"),
+        ("conquest_warrior_stats.csv", "identifier"),
+        ("conquest_warriors.csv", "identifier"),
+        ("contest_types.csv", "identifier"),
+        ("egg_groups.csv", "identifier"),
+        ("encounter_conditions.csv", "identifier"),
+        ("encounter_condition_values.csv", "identifier"),
+        ("encounter_methods.csv", "identifier"),
+        ("evolution_triggers.csv", "identifier"),
+        ("genders.csv", "identifier"),
+        ("generations.csv", "identifier"),
+        ("growth_rates.csv", "identifier"),
+        ("items.csv", "identifier"),
+        ("item_categories.csv", "identifier"),
+        ("item_flags.csv", "identifier"),
+        ("item_fling_effects.csv", "identifier"),
+        ("item_pockets.csv", "identifier"),
+        ("languages.csv", "identifier"),
+        ("locations.csv", "identifier"),
+        ("location_areas.csv", "identifier"),
+        ("moves.csv", "identifier"),
+        ("move_battle_styles.csv", "identifier"),
+        ("move_damage_classes.csv", "identifier"),
+        ("move_flags.csv", "identifier"),
+        ("move_meta_ailments.csv", "identifier"),
+        ("move_meta_categories.csv", "identifier"),
+        ("move_targets.csv", "identifier"),
+        ("natures.csv", "identifier"),
+        ("pal_park_areas.csv", "identifier"),
+        ("pokeathlon_stats.csv", "identifier"),
+        ("pokedexes.csv", "identifier"),
+        ("pokemon.csv", "identifier"),
+        ("pokemon_colors.csv", "identifier"),
+        ("pokemon_forms.csv", "identifier"),
+        ("pokemon_habitats.csv", "identifier"),
+        ("pokemon_move_methods.csv", "identifier"),
+        ("pokemon_shapes.csv", "identifier"),
+        ("pokemon_species.csv", "identifier"),
+        ("regions.csv", "identifier"),
+        ("stats.csv", "identifier"),
+        ("types.csv", "identifier"),
+        ("versions.csv", "identifier"),
+        ("version_groups.csv", "identifier"),
+    ]
+
+    def get_csv_path(self, filename):
+        """Get the absolute path to a CSV file in data/v2/csv/"""
+        from django.conf import settings
+
+        base_dir = settings.BASE_DIR
+        return os.path.join(base_dir, "data", "v2", "csv", filename)
+
+    def test_all_csv_identifiers_are_ascii_slugs(self):
+        """
+        Validate that all resource identifiers in CSV files follow the ASCII slug format.
+
+        Identifiers should only contain:
+        - Lowercase letters (a-z)
+        - Numbers (0-9)
+        - Hyphens (-)
+
+        This test will fail if any CSV contains identifiers with:
+        - Unicode characters (ñ, ’, é, etc.)
+        - Uppercase letters
+        - Spaces
+        - Special characters (&, (), ', etc.)
+        """
+        violations = []
+        missing_files = []
+
+        for filename, identifier_column in self.CSV_FILES_TO_VALIDATE:
+            csv_path = self.get_csv_path(filename)
+
+            # Track missing files to report at the end
+            if not os.path.exists(csv_path):
+                missing_files.append(filename)
+                continue
+
+            try:
+                with open(csv_path, "r", encoding="utf-8") as csvfile:
+                    reader = csv.DictReader(csvfile)
+
+                    # Check if the identifier column exists
+                    if identifier_column not in reader.fieldnames:
+                        violations.append(
+                            {
+                                "file": filename,
+                                "row": "N/A",
+                                "id": "N/A",
+                                "identifier": f"Column '{identifier_column}' not found",
+                            }
+                        )
+                        continue
+
+                    for row_num, row in enumerate(
+                        reader, start=2
+                    ):  # Start at 2 (after header)
+                        identifier = row.get(identifier_column, "").strip()
+
+                        # Skip empty identifiers
+                        if not identifier:
+                            continue
+
+                        # Check if identifier matches the pattern
+                        if not self.VALID_IDENTIFIER_PATTERN.match(identifier):
+                            violations.append(
+                                {
+                                    "file": filename,
+                                    "row": row_num,
+                                    "id": row.get("id", "N/A"),
+                                    "identifier": identifier,
+                                }
+                            )
+
+            except Exception as e:
+                violations.append(
+                    {
+                        "file": filename,
+                        "row": "N/A",
+                        "id": "N/A",
+                        "identifier": f"Error reading file: {str(e)}",
+                    }
+                )
+
+        # If there are violations or missing files, create a detailed error message
+        if violations or missing_files:
+            error_lines = []
+
+            # Report missing files first
+            if missing_files:
+                error_lines.append("\n\nMissing CSV files:")
+                for filename in missing_files:
+                    error_lines.append(f"  - {filename}")
+                error_lines.append(
+                    "\nAll CSV files listed in CSV_FILES_TO_VALIDATE must exist."
+                )
+
+            # Report violations
+            if violations:
+                error_lines.append(
+                    "\n\nFound {} resource(s) with invalid identifiers (not ASCII slugs):".format(
+                        len(violations)
+                    )
+                )
+                error_lines.append("\nIdentifiers must match pattern: ^[a-z0-9-]+$")
+                error_lines.append("\nInvalid identifiers found in CSV files:")
+
+                for v in violations:
+                    error_lines.append(
+                        "  - {file} (row {row}, id={id}): {identifier}".format(**v)
+                    )
+
+                error_lines.append(
+                    "\nThese identifiers contain invalid characters and must be normalized."
+                )
+                error_lines.append(
+                    "Update the CSV files in data/v2/csv/ to fix these identifiers."
+                )
+                error_lines.append("\nSuggested fixes:")
+                error_lines.append(
+                    "  - Remove Unicode apostrophes (’) and replace with regular hyphens or remove"
+                )
+                error_lines.append("  - Remove Unicode letters (ñ → n)")
+                error_lines.append(
+                    "  - Remove parentheses and other special characters"
+                )
+                error_lines.append("  - Convert to lowercase")
+
+            self.fail("\n".join(error_lines))
+
+    def test_identifier_pattern_examples(self):
+        """Test that the validation pattern works correctly with example identifiers."""
+        # Valid identifiers
+        valid_identifiers = [
+            "pikachu",
+            "charizard-mega-x",
+            "mr-mime",
+            "ho-oh",
+            "type-null",
+            "item-123",
+            "mega-stone",
+        ]
+
+        for identifier in valid_identifiers:
+            self.assertTrue(
+                self.VALID_IDENTIFIER_PATTERN.match(identifier),
+                f"{identifier} should be valid but was rejected",
+            )
+
+        # Invalid identifiers
+        invalid_identifiers = [
+            "Pikachu",  # Uppercase
+            "Mr. Mime",  # Space and period
+            "kofu’s-wallet",  # Unicode apostrophe
+            "jalapeño",  # Unicode ñ
+            "steel-bottle-(r)",  # Parentheses
+            "b&w-grass-tablecloth",  # Ampersand
+            "farfetch'd",  # Apostrophe
+            "kofu's-wallet",  # Regular apostrophe
+        ]
+
+        for identifier in invalid_identifiers:
+            self.assertFalse(
+                self.VALID_IDENTIFIER_PATTERN.match(identifier),
+                f"{identifier} should be invalid but was accepted",
+            )

From 7c6f0346b8ec25527e00f8cdbf2950ad7ff15708 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc?= <loic.outhier.work@gmail.com>
Date: Thu, 12 Mar 2026 15:22:21 +0100
Subject: [PATCH 2/2] Refactor CSV validation based on maintainer feedback

---
 pokemon_v2/test_models.py | 127 ++++++++++++++++++--------------------
 1 file changed, 59 insertions(+), 68 deletions(-)

diff --git a/pokemon_v2/test_models.py b/pokemon_v2/test_models.py
index 891799da8..559413f54 100644
--- a/pokemon_v2/test_models.py
+++ b/pokemon_v2/test_models.py
@@ -1,6 +1,7 @@
 import csv
 import os
 import re
+from django.conf import settings
 from django.test import TestCase
 from pokemon_v2.models import *
 
@@ -27,69 +28,61 @@ class CSVResourceNameValidationTestCase(TestCase):
     # Pattern for valid resource identifiers: lowercase letters, numbers, and hyphens only
     VALID_IDENTIFIER_PATTERN = re.compile(r"^[a-z0-9-]+$")
 
-    # CSV files that contain an 'identifier' column to validate
-    # Format: (filename, identifier_column_name)
+    # CSV files with 'identifier' column to validate
     CSV_FILES_TO_VALIDATE = [
-        ("abilities.csv", "identifier"),
-        ("berry_firmness.csv", "identifier"),
-        ("conquest_episodes.csv", "identifier"),
-        ("conquest_kingdoms.csv", "identifier"),
-        ("conquest_move_displacements.csv", "identifier"),
-        ("conquest_move_ranges.csv", "identifier"),
-        ("conquest_stats.csv", "identifier"),
-        ("conquest_warrior_archetypes.csv", "identifier"),
-        ("conquest_warrior_skills.csv", "identifier"),
-        ("conquest_warrior_stats.csv", "identifier"),
-        ("conquest_warriors.csv", "identifier"),
-        ("contest_types.csv", "identifier"),
-        ("egg_groups.csv", "identifier"),
-        ("encounter_conditions.csv", "identifier"),
-        ("encounter_condition_values.csv", "identifier"),
-        ("encounter_methods.csv", "identifier"),
-        ("evolution_triggers.csv", "identifier"),
-        ("genders.csv", "identifier"),
-        ("generations.csv", "identifier"),
-        ("growth_rates.csv", "identifier"),
-        ("items.csv", "identifier"),
-        ("item_categories.csv", "identifier"),
-        ("item_flags.csv", "identifier"),
-        ("item_fling_effects.csv", "identifier"),
-        ("item_pockets.csv", "identifier"),
-        ("languages.csv", "identifier"),
-        ("locations.csv", "identifier"),
-        ("location_areas.csv", "identifier"),
-        ("moves.csv", "identifier"),
-        ("move_battle_styles.csv", "identifier"),
-        ("move_damage_classes.csv", "identifier"),
-        ("move_flags.csv", "identifier"),
-        ("move_meta_ailments.csv", "identifier"),
-        ("move_meta_categories.csv", "identifier"),
-        ("move_targets.csv", "identifier"),
-        ("natures.csv", "identifier"),
-        ("pal_park_areas.csv", "identifier"),
-        ("pokeathlon_stats.csv", "identifier"),
-        ("pokedexes.csv", "identifier"),
-        ("pokemon.csv", "identifier"),
-        ("pokemon_colors.csv", "identifier"),
-        ("pokemon_forms.csv", "identifier"),
-        ("pokemon_habitats.csv", "identifier"),
-        ("pokemon_move_methods.csv", "identifier"),
-        ("pokemon_shapes.csv", "identifier"),
-        ("pokemon_species.csv", "identifier"),
-        ("regions.csv", "identifier"),
-        ("stats.csv", "identifier"),
-        ("types.csv", "identifier"),
-        ("versions.csv", "identifier"),
-        ("version_groups.csv", "identifier"),
+        "abilities.csv",
+        "berry_firmness.csv",
+        "conquest_episodes.csv",
+        "conquest_kingdoms.csv",
+        "conquest_move_displacements.csv",
+        "conquest_move_ranges.csv",
+        "conquest_stats.csv",
+        "conquest_warrior_archetypes.csv",
+        "conquest_warrior_skills.csv",
+        "conquest_warrior_stats.csv",
+        "conquest_warriors.csv",
+        "contest_types.csv",
+        "egg_groups.csv",
+        "encounter_conditions.csv",
+        "encounter_condition_values.csv",
+        "encounter_methods.csv",
+        "evolution_triggers.csv",
+        "genders.csv",
+        "generations.csv",
+        "growth_rates.csv",
+        "items.csv",
+        "item_categories.csv",
+        "item_flags.csv",
+        "item_fling_effects.csv",
+        "item_pockets.csv",
+        "languages.csv",
+        "locations.csv",
+        "location_areas.csv",
+        "moves.csv",
+        "move_battle_styles.csv",
+        "move_damage_classes.csv",
+        "move_flags.csv",
+        "move_meta_ailments.csv",
+        "move_meta_categories.csv",
+        "move_targets.csv",
+        "natures.csv",
+        "pal_park_areas.csv",
+        "pokeathlon_stats.csv",
+        "pokedexes.csv",
+        "pokemon.csv",
+        "pokemon_colors.csv",
+        "pokemon_forms.csv",
+        "pokemon_habitats.csv",
+        "pokemon_move_methods.csv",
+        "pokemon_shapes.csv",
+        "pokemon_species.csv",
+        "regions.csv",
+        "stats.csv",
+        "types.csv",
+        "versions.csv",
+        "version_groups.csv",
     ]
 
-    def get_csv_path(self, filename):
-        """Get the absolute path to a CSV file in data/v2/csv/"""
-        from django.conf import settings
-
-        base_dir = settings.BASE_DIR
-        return os.path.join(base_dir, "data", "v2", "csv", filename)
-
     def test_all_csv_identifiers_are_ascii_slugs(self):
         """
         Validate that all resource identifiers in CSV files follow the ASCII slug format.
@@ -108,8 +101,8 @@ def test_all_csv_identifiers_are_ascii_slugs(self):
         violations = []
         missing_files = []
 
-        for filename, identifier_column in self.CSV_FILES_TO_VALIDATE:
-            csv_path = self.get_csv_path(filename)
+        for filename in self.CSV_FILES_TO_VALIDATE:
+            csv_path = os.path.join(settings.BASE_DIR, "data", "v2", "csv", filename)
 
             # Track missing files to report at the end
             if not os.path.exists(csv_path):
@@ -121,21 +114,19 @@ def test_all_csv_identifiers_are_ascii_slugs(self):
                     reader = csv.DictReader(csvfile)
 
                     # Check if the identifier column exists
-                    if identifier_column not in reader.fieldnames:
+                    if "identifier" not in reader.fieldnames:
                         violations.append(
                             {
                                 "file": filename,
                                 "row": "N/A",
                                 "id": "N/A",
-                                "identifier": f"Column '{identifier_column}' not found",
+                                "identifier": "Column 'identifier' not found",
                             }
                         )
                         continue
 
-                    for row_num, row in enumerate(
-                        reader, start=2
-                    ):  # Start at 2 (after header)
-                        identifier = row.get(identifier_column, "").strip()
+                    for row_num, row in enumerate(reader, start=2):
+                        identifier = row.get("identifier", "").strip()
 
                         # Skip empty identifiers
                         if not identifier: