Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 226 additions & 0 deletions pokemon_v2/test_models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import csv
import os
import re
from django.conf import settings
from django.test import TestCase
from pokemon_v2.models import *

Expand All @@ -9,3 +13,225 @@ def setUp(self):
def fields_are_valid(self):
smell = Ability.objects.get(name="Smell")
self.assertEqual(smell.generation_id, 3)


class CSVResourceNameValidationTestCase(TestCase):
"""
Test that all resource identifiers in CSV files follow ASCII slug format.

Resource identifiers are used in API URLs and should be URL-safe ASCII slugs
(lowercase letters, numbers, and hyphens only).

This test validates the data source (CSV files) before it's loaded into the database.
"""

# Pattern for valid resource identifiers: lowercase letters, numbers, and hyphens only
VALID_IDENTIFIER_PATTERN = re.compile(r"^[a-z0-9-]+$")

# CSV files with 'identifier' column to validate
CSV_FILES_TO_VALIDATE = [
"abilities.csv",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, I prefer an approch where all files are scanned and if a column identifier is found, then, we check it.

"berry_firmness.csv",
"conquest_episodes.csv",
"conquest_kingdoms.csv",
"conquest_move_displacements.csv",
"conquest_move_ranges.csv",
"conquest_stats.csv",
"conquest_warrior_archetypes.csv",
"conquest_warrior_skills.csv",
"conquest_warrior_stats.csv",
"conquest_warriors.csv",
"contest_types.csv",
"egg_groups.csv",
"encounter_conditions.csv",
"encounter_condition_values.csv",
"encounter_methods.csv",
"evolution_triggers.csv",
"genders.csv",
"generations.csv",
"growth_rates.csv",
"items.csv",
"item_categories.csv",
"item_flags.csv",
"item_fling_effects.csv",
"item_pockets.csv",
"languages.csv",
"locations.csv",
"location_areas.csv",
"moves.csv",
"move_battle_styles.csv",
"move_damage_classes.csv",
"move_flags.csv",
"move_meta_ailments.csv",
"move_meta_categories.csv",
"move_targets.csv",
"natures.csv",
"pal_park_areas.csv",
"pokeathlon_stats.csv",
"pokedexes.csv",
"pokemon.csv",
"pokemon_colors.csv",
"pokemon_forms.csv",
"pokemon_habitats.csv",
"pokemon_move_methods.csv",
"pokemon_shapes.csv",
"pokemon_species.csv",
"regions.csv",
"stats.csv",
"types.csv",
"versions.csv",
"version_groups.csv",
]

def test_all_csv_identifiers_are_ascii_slugs(self):
"""
Validate that all resource identifiers in CSV files follow the ASCII slug format.

Identifiers should only contain:
- Lowercase letters (a-z)
- Numbers (0-9)
- Hyphens (-)

This test will fail if any CSV contains identifiers with:
- Unicode characters (ñ, ', é, etc.)
- Uppercase letters
- Spaces
- Special characters (&, (), ', etc.)
"""
violations = []
missing_files = []

for filename in self.CSV_FILES_TO_VALIDATE:
csv_path = os.path.join(settings.BASE_DIR, "data", "v2", "csv", filename)

# Track missing files to report at the end
if not os.path.exists(csv_path):
missing_files.append(filename)
continue

try:
with open(csv_path, "r", encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile)

# Check if the identifier column exists
if "identifier" not in reader.fieldnames:
violations.append(
{
"file": filename,
"row": "N/A",
"id": "N/A",
"identifier": "Column 'identifier' not found",
}
)
continue

for row_num, row in enumerate(reader, start=2):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why start=2?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To skip checking the header line

identifier = row.get("identifier", "").strip()

# Skip empty identifiers
if not identifier:
continue

# Check if identifier matches the pattern
if not self.VALID_IDENTIFIER_PATTERN.match(identifier):
violations.append(
{
"file": filename,
"row": row_num,
"id": row.get("id", "N/A"),
"identifier": identifier,
}
)

except Exception as e:
violations.append(
{
"file": filename,
"row": "N/A",
"id": "N/A",
"identifier": f"Error reading file: {str(e)}",
}
)

# If there are violations or missing files, create a detailed error message
if violations or missing_files:
error_lines = []

# Report missing files first
if missing_files:
error_lines.append("\n\nMissing CSV files:")
for filename in missing_files:
error_lines.append(f" - {filename}")
error_lines.append(
"\nAll CSV files listed in CSV_FILES_TO_VALIDATE must exist."
)

# Report violations
if violations:
error_lines.append(
"\n\nFound {} resource(s) with invalid identifiers (not ASCII slugs):".format(
len(violations)
)
)
error_lines.append("\nIdentifiers must match pattern: ^[a-z0-9-]+$")
error_lines.append("\nInvalid identifiers found in CSV files:")

for v in violations:
error_lines.append(
" - {file} (row {row}, id={id}): {identifier}".format(**v)
)

error_lines.append(
"\nThese identifiers contain invalid characters and must be normalized."
)
error_lines.append(
"Update the CSV files in data/v2/csv/ to fix these identifiers."
)
error_lines.append("\nSuggested fixes:")
error_lines.append(
" - Remove Unicode apostrophes (') and replace with regular hyphens or remove"
)
error_lines.append(" - Remove Unicode letters (ñ → n)")
error_lines.append(
" - Remove parentheses and other special characters"
)
error_lines.append(" - Convert to lowercase")

self.fail("\n".join(error_lines))

def test_identifier_pattern_examples(self):
"""Test that the validation pattern works correctly with example identifiers."""
# Valid identifiers
valid_identifiers = [
"pikachu",
"charizard-mega-x",
"mr-mime",
"ho-oh",
"type-null",
"item-123",
"mega-stone",
]

for identifier in valid_identifiers:
self.assertTrue(
self.VALID_IDENTIFIER_PATTERN.match(identifier),
f"{identifier} should be valid but was rejected",
)

# Invalid identifiers
invalid_identifiers = [
"Pikachu", # Uppercase
"Mr. Mime", # Space and period
"kofu's-wallet", # Unicode apostrophe
"jalapeño", # Unicode ñ
"steel-bottle-(r)", # Parentheses
"b&w-grass-tablecloth", # Ampersand
"farfetch'd", # Apostrophe
"kofu's-wallet", # Regular apostrophe
]

for identifier in invalid_identifiers:
self.assertFalse(
self.VALID_IDENTIFIER_PATTERN.match(identifier),
f"{identifier} should be invalid but was accepted",
)
Loading