diff --git a/scripts/world_bank/wdi/README.md b/scripts/world_bank/wdi/README.md index ef1f0f5dc6..239d0f7277 100644 --- a/scripts/world_bank/wdi/README.md +++ b/scripts/world_bank/wdi/README.md @@ -146,5 +146,24 @@ If you want to perform "only download", run the below command: python3 worldbank.py --mode=download ``` +### Golden files, golden checks, and the increased threshold in validation_config.json + +The `GOLDENS_CHECK` validator confirms that the import includes a specific set of expected records. This is useful for verifying that critical StatVars, Places, or specific metadata combinations are consistently present in the output. + +The validator compares the input data (usually from the stats data source) against one or more "golden" files (MCF or CSV). + +If any combination of values in a golden file row is missing from the input, the validation fails. The missing golden rows are then listed in the validation report JSON. + +To generate the golden files, run the commands below: +```bash +# goldens from output csv +python3 validator_goldens.py --validate_goldens_input=../../scripts/world_bank/wdi/output/WorldBank.csv --generate_goldens=golden_data/golden_WorldBank.csv --goldens_must_include="ISO3166Alpha3:gs://unresolved_mcf/import_validation/top_100k_places.csv" --generate_goldens_property_sets="ISO3166Alpha3" +``` + +```bash +# goldens from summary reports +python3 validator_goldens.py --validate_goldens_input="summary_report.csv" --generate_goldens=golden_data/golden_summary_report.csv --generate_goldens_property_sets="StatVar|Units|MinDate|MeasurementMethods|observationPeriods" +``` + We highly recommend the use of the import validation tool for this import which you can find in https://github.com/datacommonsorg/tools/tree/master/import-validation-helper. 
diff --git a/scripts/world_bank/wdi/golden_data/golden_WorldBank.csv b/scripts/world_bank/wdi/golden_data/golden_WorldBank.csv new file mode 100644 index 0000000000..44f68b13a0 --- /dev/null +++ b/scripts/world_bank/wdi/golden_data/golden_WorldBank.csv @@ -0,0 +1,217 @@ +"ISO3166Alpha3" +"dcid:Earth" +"dcid:country/AGO" +"dcid:country/ALB" +"dcid:country/ARE" +"dcid:country/ARG" +"dcid:country/ARM" +"dcid:country/AUS" +"dcid:country/AUT" +"dcid:country/AZE" +"dcid:country/BEL" +"dcid:country/BEN" +"dcid:country/BFA" +"dcid:country/BGD" +"dcid:country/BGR" +"dcid:country/BHR" +"dcid:country/BIH" +"dcid:country/BLR" +"dcid:country/BOL" +"dcid:country/BRA" +"dcid:country/BRN" +"dcid:country/BWA" +"dcid:country/CAN" +"dcid:country/CHE" +"dcid:country/CHL" +"dcid:country/CHN" +"dcid:country/CIV" +"dcid:country/CMR" +"dcid:country/COD" +"dcid:country/COG" +"dcid:country/COL" +"dcid:country/CRI" +"dcid:country/CUB" +"dcid:country/CUW" +"dcid:country/CYP" +"dcid:country/CZE" +"dcid:country/DEU" +"dcid:country/DNK" +"dcid:country/DOM" +"dcid:country/DZA" +"dcid:country/ECU" +"dcid:country/EGY" +"dcid:country/ERI" +"dcid:country/ESP" +"dcid:country/EST" +"dcid:country/ETH" +"dcid:country/FIN" +"dcid:country/FRA" +"dcid:country/GAB" +"dcid:country/GBR" +"dcid:country/GEO" +"dcid:country/GHA" +"dcid:country/GIB" +"dcid:country/GNQ" +"dcid:country/GRC" +"dcid:country/GTM" +"dcid:country/HKG" +"dcid:country/HND" +"dcid:country/HRV" +"dcid:country/HTI" +"dcid:country/HUN" +"dcid:country/IDN" +"dcid:country/IND" +"dcid:country/IRL" +"dcid:country/IRN" +"dcid:country/IRQ" +"dcid:country/ISL" +"dcid:country/ISR" +"dcid:country/ITA" +"dcid:country/JAM" +"dcid:country/JOR" +"dcid:country/JPN" +"dcid:country/KAZ" +"dcid:country/KEN" +"dcid:country/KGZ" +"dcid:country/KHM" +"dcid:country/KOR" +"dcid:country/KWT" +"dcid:country/LAO" +"dcid:country/LBN" +"dcid:country/LBY" +"dcid:country/LKA" +"dcid:country/LTU" +"dcid:country/LUX" +"dcid:country/LVA" +"dcid:country/MAR" 
+"dcid:country/MDA" +"dcid:country/MDG" +"dcid:country/MEX" +"dcid:country/MKD" +"dcid:country/MLT" +"dcid:country/MMR" +"dcid:country/MNE" +"dcid:country/MNG" +"dcid:country/MOZ" +"dcid:country/MUS" +"dcid:country/MYS" +"dcid:country/NAM" +"dcid:country/NER" +"dcid:country/NGA" +"dcid:country/NIC" +"dcid:country/NLD" +"dcid:country/NOR" +"dcid:country/NPL" +"dcid:country/NZL" +"dcid:country/OMN" +"dcid:country/PAK" +"dcid:country/PAN" +"dcid:country/PER" +"dcid:country/PHL" +"dcid:country/POL" +"dcid:country/PRK" +"dcid:country/PRT" +"dcid:country/PRY" +"dcid:country/QAT" +"dcid:country/ROU" +"dcid:country/RUS" +"dcid:country/RWA" +"dcid:country/SAU" +"dcid:country/SDN" +"dcid:country/SEN" +"dcid:country/SGP" +"dcid:country/SLV" +"dcid:country/SRB" +"dcid:country/SSD" +"dcid:country/SUR" +"dcid:country/SVK" +"dcid:country/SVN" +"dcid:country/SWE" +"dcid:country/SWZ" +"dcid:country/SYR" +"dcid:country/TCD" +"dcid:country/TGO" +"dcid:country/THA" +"dcid:country/TJK" +"dcid:country/TKM" +"dcid:country/TTO" +"dcid:country/TUN" +"dcid:country/TUR" +"dcid:country/TZA" +"dcid:country/UGA" +"dcid:country/UKR" +"dcid:country/URY" +"dcid:country/USA" +"dcid:country/UZB" +"dcid:country/VEN" +"dcid:country/VNM" +"dcid:country/XKS" +"dcid:country/YEM" +"dcid:country/ZAF" +"dcid:country/ZMB" +"dcid:country/ZWE" +"dcid:country/ATG" +"dcid:country/BHS" +"dcid:country/BLZ" +"dcid:country/BRB" +"dcid:country/BTN" +"dcid:country/COM" +"dcid:country/CPV" +"dcid:country/DJI" +"dcid:country/DMA" +"dcid:country/FJI" +"dcid:country/GMB" +"dcid:country/GNB" +"dcid:country/GRD" +"dcid:country/GUY" +"dcid:country/KIR" +"dcid:country/KNA" +"dcid:country/LCA" +"dcid:country/LSO" +"dcid:country/MDV" +"dcid:country/MHL" +"dcid:country/PLW" +"dcid:country/SLB" +"dcid:country/STP" +"dcid:country/SYC" +"dcid:country/TLS" +"dcid:country/TON" +"dcid:country/VCT" +"dcid:country/VUT" +"dcid:country/WSM" +"dcid:ChannelIslands" +"dcid:country/ABW" +"dcid:country/AFG" +"dcid:country/AND" 
+"dcid:country/ASM" +"dcid:country/BDI" +"dcid:country/BMU" +"dcid:country/CAF" +"dcid:country/CYM" +"dcid:country/FRO" +"dcid:country/FSM" +"dcid:country/GIN" +"dcid:country/GRL" +"dcid:country/GUM" +"dcid:country/IMN" +"dcid:country/LBR" +"dcid:country/LIE" +"dcid:country/MAC" +"dcid:country/MAF" +"dcid:country/MCO" +"dcid:country/MLI" +"dcid:country/MNP" +"dcid:country/MRT" +"dcid:country/MWI" +"dcid:country/NCL" +"dcid:country/PNG" +"dcid:country/PRI" +"dcid:country/PSE" +"dcid:country/PYF" +"dcid:country/SLE" +"dcid:country/SMR" +"dcid:country/SOM" +"dcid:country/SXM" +"dcid:country/TCA" +"dcid:country/VIR" +"dcid:country/VGB" diff --git a/scripts/world_bank/wdi/golden_data/golden_summary_report.csv b/scripts/world_bank/wdi/golden_data/golden_summary_report.csv new file mode 100644 index 0000000000..cd7541999f --- /dev/null +++ b/scripts/world_bank/wdi/golden_data/golden_summary_report.csv @@ -0,0 +1,80 @@ +"StatVar","MinDate","Units","MeasurementMethods","observationPeriods" +"Count_Death_IntentionalSelfHarm_Male_AsFractionOf_Count_Person_Male","2000","[Per100000Males]","[]","[P1Y]" +"Amount_EconomicActivity_GrossNationalIncome_PurchasingPowerParity","1990","[InternationalDollar]","[]","[P1Y]" +"Count_Person_Upto4Years_Wasting_AsFractionOf_Count_Person_Upto4Years","1983","[Percent]","[JointChildMalnutritionEstimate]","[P1Y]" +"Count_Person_25OrMoreYears_DoctorateDegree_AsFractionOf_Count_Person_25OrMoreYears","1994","[]","[]","[P1Y]" +"Amount_Emissions_CarbonDioxide_PerCapita","1970","[MetricTon]","[]","[P1Y]" +"Count_Person_25OrMoreYears_Male_TertiaryEducation_AsFractionOf_Count_Person_25OrMoreYears_Male","1970","[]","[]","[P1Y]" +"LifeExpectancy_Person_Female","1960","[Year]","[]","[P1Y]" +"Count_Person_25OrMoreYears_Male_DoctorateDegree_AsFractionOf_Count_Person_25OrMoreYears_Male","1994","[]","[]","[P1Y]" +"Count_Death_0Years_Female_AsFractionOf_Count_BirthEvent_LiveBirth_Female","1960","[Per1000FemaleLiveBirths]","[UnitedNationsIGMEEstimate]","[P1Y]" 
+"Count_CriminalActivities_MurderAndNonNegligentManslaughter_AsFractionOf_Count_Person","1990","[Per100000Persons]","[]","[P1Y]" +"Amount_EconomicActivity_ExpenditureActivity_HealthcareExpenditure_AsFractionOf_Count_Person","2000","[InternationalDollar, USDollar]","[]","[P1Y]" +"Amount_EconomicActivity_ExpenditureActivity_EducationExpenditure_Government_AsFractionOf_Amount_EconomicActivity_ExpenditureActivity_Government","1980","[Percent]","[]","[P1Y]" +"Count_Person_25OrMoreYears_Male_BachelorsDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Male","1970","[]","[]","[P1Y]" +"FertilityRate_Person_Female","1960","[]","[]","[]" +"Count_Person_Rural","1960","[]","[WorldBankEstimate]","[P1Y]" +"Count_Person_25OrMoreYears_Female_TertiaryEducation_AsFractionOf_Count_Person_25OrMoreYears_Female","1970","[]","[]","[P1Y]" +"Count_Person_Urban","1960","[]","[WorldBankEstimate]","[P1Y]" +"Count_Person_Upto4Years_Overweight_AsFractionOf_Count_Person_Upto4Years","1983","[]","[]","[P1Y]" +"LifeExpectancy_Person_Male","1960","[Year]","[]","[P1Y]" +"Count_BirthEvent_LiveBirth_AsFractionOf_Count_Person","1960","[Per1000Persons]","[]","[P1Y]" +"MortalityRate_Person_Upto4Years_AsFractionOf_Count_BirthEvent_LiveBirth","1960","[Per1000LiveBirths]","[]","[P1Y]" +"Count_Person","1960","[]","[]","[P1Y]" +"Count_Person_7To14Years_Male_Employed_AsFractionOf_Count_Person_7To14Years_Male","1994","[Percent]","[]","[P1Y]" +"Count_Person_Upto4Years_Male_Wasting_AsFractionOf_Count_Person_Upto4Years_Male","1986","[Percent]","[JointChildMalnutritionEstimate]","[P1Y]" +"Amount_EconomicActivity_ExpenditureActivity_EducationExpenditure_Government_AsFractionOf_Amount_EconomicActivity_GrossDomesticProduction_Nominal","1970","[Percent]","[]","[P1Y]" +"Count_Person_25OrMoreYears_BachelorsDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears","1970","[]","[]","[P1Y]" 
+"Count_Person_15OrMoreYears_Female_Smoking_AsFractionOf_Count_Person_15OrMoreYears_Female","2000","[]","[AgeAdjustedPrevalence]","[P1Y]" +"Count_Person_15OrMoreYears_Smoking_AsFractionOf_Count_Person_15OrMoreYears","2000","[]","[AgeAdjustedPrevalence]","[P1Y]" +"Amount_EconomicActivity_GrossNationalIncome_PurchasingPowerParity_PerCapita","1990","[InternationalDollar]","[]","[P1Y]" +"Count_Person_Upto4Years_Male_Overweight_AsFractionOf_Count_Person_Upto4Years_Male","1986","[]","[]","[P1Y]" +"Count_Death_0Years","1960","[]","[UnitedNationsIGMEEstimate]","[P1Y]" +"Amount_EconomicActivity_ExpenditureActivity_TertiaryEducationExpenditure_Government_AsFractionOf_Amount_EconomicActivity_ExpenditureActivity_EducationExpenditure_Government","1970","[]","[]","[P1Y]" +"Count_Person_Upto4Years_Male_SevereWasting_AsFractionOf_Count_Person_Upto4Years_Male","1986","[Percent]","[JointChildMalnutritionEstimate]","[P1Y]" +"Amount_Consumption_Electricity_PerCapita","1990","[KilowattHour]","[]","[P1Y]" +"Count_Death_0Years_Male_AsFractionOf_Count_BirthEvent_LiveBirth_Male","1960","[Per1000MaleLiveBirths]","[UnitedNationsIGMEEstimate]","[P1Y]" +"Amount_Consumption_Energy_PerCapita","1990","[KilogramOfOilEquivalent]","[]","[P1Y]" +"Count_Death_IntentionalSelfHarm_Female_AsFractionOf_Count_Person_Female","2000","[Per100000Females]","[]","[P1Y]" +"Count_Person_15OrMoreYears_Male_Smoking_AsFractionOf_Count_Person_15OrMoreYears_Male","2000","[]","[AgeAdjustedPrevalence]","[P1Y]" +"Count_CriminalActivities_MurderAndNonNegligentManslaughter_Male_AsFractionOf_Count_Person_Male","1990","[Per100000Males]","[]","[P1Y]" +"Amount_Remittance_InwardRemittance_AsFractionOf_Amount_EconomicActivity_GrossDomesticProduction_Nominal","1970","[Percent]","[WorldBankEstimate]","[P1Y]" +"Count_Person_15To64Years_InLaborForce_AsFractionOf_Count_Person_15To64Years","1990","[]","[]","[P1Y]" +"Count_Person_7To14Years_Employed_AsFractionOf_Count_Person_7To14Years","1994","[Percent]","[]","[P1Y]" 
+"GiniIndex_EconomicActivity","1963","[]","[WorldBankEstimate]","[P1Y]" +"Count_Person_25OrMoreYears_Female_MastersDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Female","1990","[]","[]","[P1Y]" +"Count_Person_25OrMoreYears_MastersDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears","1990","[]","[]","[P1Y]" +"Count_CriminalActivities_MurderAndNonNegligentManslaughter_Female_AsFractionOf_Count_Person_Female","1990","[Per100000Females]","[]","[P1Y]" +"Count_Person_15To64Years_Female_InLaborForce_AsFractionOf_Count_Person_15To64Years_Female","1990","[]","[]","[P1Y]" +"Amount_Stock_AsFractionOf_Amount_EconomicActivity_GrossDomesticProduction_Nominal","1975","[Percent]","[]","[P1Y]" +"Count_Person_25OrMoreYears_Female_DoctorateDegree_AsFractionOf_Count_Person_25OrMoreYears_Female","1994","[]","[]","[P1Y]" +"GrowthRate_Amount_EconomicActivity_GrossDomesticProduction","1961","[]","[]","[P1Y]" +"Count_Death_AsAFractionOfCount_Person","1960","[Per1000Persons]","[WorldBankWeightedAverage]","[P1Y]" +"Amount_EconomicActivity_GrossDomesticProduction_Nominal","1960","[USDollar]","[]","[P1Y]" +"Count_Person_15To64Years_Male_InLaborForce_AsFractionOf_Count_Person_15To64Years_Male","1990","[]","[]","[P1Y]" +"Amount_Remittance_InwardRemittance","1970","[USDollar]","[WorldBankEstimate]","[P1Y]" +"Count_Person_Upto4Years_SevereWasting_AsFractionOf_Count_Person_Upto4Years","1983","[Percent]","[JointChildMalnutritionEstimate]","[P1Y]" +"Count_Person_25OrMoreYears_Female_BachelorsDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Female","1970","[]","[]","[P1Y]" +"Count_Person_7To14Years_Female_Employed_AsFractionOf_Count_Person_7To14Years_Female","1994","[Percent]","[]","[P1Y]" +"Count_Person_25OrMoreYears_Male_MastersDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Male","1990","[]","[]","[P1Y]" +"Amount_EconomicActivity_GrossDomesticProduction_Nominal_PerCapita","1960","[USDollar]","[]","[P1Y]" 
+"Amount_Consumption_Alcohol_15OrMoreYears_AsFractionOf_Count_Person_15OrMoreYears","2000","[Liter]","[WorldHealthOrganizationEstimates]","[P1Y]" +"Count_Person_15OrMoreYears_InLaborForce_Female_AsFractionOf_Count_Person_InLaborForce","1990","[]","[]","[P1Y]" +"Count_Person_ResidingLessThan5MetersAboveSeaLevel_AsFractionOf_Count_Person","1990","[]","[]","[P1Y]" +"Count_Product_MobileCellularSubscription_AsFractionOf_Count_Person","1960","[]","[]","[P1Y]" +"Count_Person_InLaborForce","1990","[]","[InternationalLaborOrganization]","[P1Y]" +"Count_Death_IntentionalSelfHarm_AsFractionOf_Count_Person","2000","[Per100000Persons]","[]","[P1Y]" +"Count_Death_0Years_AsFractionOf_Count_BirthEvent_LiveBirth","1960","[Per1000LiveBirths]","[UnitedNationsIGMEEstimate]","[P1Y]" +"Count_Person_Upto4Years_Female_Wasting_AsFractionOf_Count_Person_Upto4Years_Female","1986","[Percent]","[JointChildMalnutritionEstimate]","[P1Y]" +"Amount_Remittance_OutwardRemittance","1970","[USDollar]","[WorldBankEstimate]","[P1Y]" +"Count_Person_Upto4Years_Female_Overweight_AsFractionOf_Count_Person_Upto4Years_Female","1986","[]","[]","[P1Y]" +"Count_Person_IsInternetUser_PerCapita","1990","[]","[]","[P1Y]" +"Amount_Production_ElectricityFromNuclearSources_AsFractionOf_Amount_Production_Energy","1990","[]","[]","[P1Y]" +"Count_Person_Upto4Years_Female_SevereWasting_AsFractionOf_Count_Person_Upto4Years_Female","1986","[Percent]","[JointChildMalnutritionEstimate]","[P1Y]" +"Count_Person_25OrMoreYears_TertiaryEducation_AsFractionOf_Count_Person_25OrMoreYears","1970","[]","[]","[P1Y]" +"Amount_Production_ElectricityFromOilGasOrCoalSources_AsFractionOf_Amount_Production_Energy","1990","[]","[]","[P1Y]" +"GrowthRate_Count_Person","1961","[]","[]","[P1Y]" +"Amount_Consumption_RenewableEnergy_AsFractionOf_Amount_Consumption_Energy","1990","[]","[]","[P1Y]" +"Amount_Stock","1975","[USDollar]","[]","[P1Y]" +"LifeExpectancy_Person","1960","[Year]","[]","[]" 
+"Count_Person_20To79Years_Diabetes_AsFractionOf_Count_Person_20To79Years","2000","[]","[]","[P1Y]" diff --git a/scripts/world_bank/wdi/manifest.json b/scripts/world_bank/wdi/manifest.json index bc3927141e..eb427c0472 100644 --- a/scripts/world_bank/wdi/manifest.json +++ b/scripts/world_bank/wdi/manifest.json @@ -20,7 +20,8 @@ "WorldBankCountries.csv", "schema_csvs/WorldBankIndicators_prod.csv" ], - "cron_schedule": "0 11 * * 2" + "cron_schedule": "0 11 * * 2", + "validation_config_file": "validation_config.json" } ] } \ No newline at end of file diff --git a/scripts/world_bank/wdi/validation_config.json b/scripts/world_bank/wdi/validation_config.json new file mode 100644 index 0000000000..7a7a9c70e3 --- /dev/null +++ b/scripts/world_bank/wdi/validation_config.json @@ -0,0 +1,28 @@ +{ + "schema_version": "1.0", + "rules": [ + { + "rule_id": "check_deleted_records_percent", + "description": "Checks that the percentage of deleted points is within the threshold.", + "validator": "DELETED_RECORDS_PERCENT", + "params": { + "threshold": 0.1 + } + }, + { + "rule_id": "check_goldens_output_csv", + "validator": "GOLDENS_CHECK", + "params": { + "golden_files": "golden_data/golden_WorldBank.csv", + "input_files": "output/WorldBank.csv" + } + }, + { + "rule_id": "check_goldens_summary_report", + "validator": "GOLDENS_CHECK", + "params": { + "golden_files": "golden_data/golden_summary_report.csv" + } + } + ] +} \ No newline at end of file diff --git a/tools/import_validation/Validations.md b/tools/import_validation/Validations.md index 4efebb3a55..d46ece74fc 100644 --- a/tools/import_validation/Validations.md +++ b/tools/import_validation/Validations.md @@ -72,6 +72,8 @@ To generate goldens for the summary_report.csv to verify that all the expected StatVars are generated with the corresponding number of places and dates, run the following: +This will compare the golden files using summary_report.csv as the default input: + ```shell python3 validator_goldens.py \ 
--validate_goldens_input=summary_report.csv \