"""Upload the canonical ``countries`` TSV file to a data-lake table."""

import argparse
import os

import wmfdata as wmf


def main():
    """Create ``<database>.countries`` if needed and fill it from a TSV file.

    Command-line options:
        --database: database that holds the ``countries`` table
            (default ``canonical_data``).
        --data_file: path of the tab-separated data file with a header row
            (default ``countries.tsv``).
        --create_table_statement: path of the HQL file containing the
            ``CREATE TABLE`` statement
            (default ``create_canonical_data_countries_table.hql``).

    Raises:
        OSError: if the create-table statement file cannot be read.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--database",
        help="Name of the database to put the table countries into",
        default="canonical_data",
    )
    parser.add_argument("--data_file", help="TSV table", default="countries.tsv")
    parser.add_argument(
        "--create_table_statement",
        help="HQL file containing the CREATE TABLE statement",
        default="create_canonical_data_countries_table.hql",
    )
    args = parser.parse_args()

    # The parser defines no --table option; the table name is fixed by the
    # CREATE TABLE statement, so build the qualified name once and reuse it.
    table = f"{args.database}.countries"

    spark = wmf.spark.get_session(type="local")

    # abspath() resolves relative paths against the working directory and
    # leaves absolute paths untouched, avoiding a doubled slash in the URI.
    data_path = "file://" + os.path.abspath(args.data_file)
    # The input is tab-separated; Spark's CSV reader defaults to commas.
    df = spark.read.csv(data_path, header=True, sep="\t")
    print(f"Filling {table} with {df.count()} line(s)")

    with open(args.create_table_statement, encoding="utf-8") as hql_file:
        # Drop the trailing terminator so a single bare statement remains.
        create_statement = hql_file.read().strip().rstrip(";").rstrip()

    # spark.sql() parses one statement per call, so select the database and
    # create the table in two separate calls.
    use_statement = f"USE {args.database}"
    print(use_statement)
    spark.sql(use_statement)
    print(create_statement)
    spark.sql(create_statement)

    # NOTE(review): saveAsTable() in overwrite mode appears to replace the
    # table definition with the DataFrame's (all-string) schema instead of
    # filling the columns declared by the CREATE TABLE statement -- confirm
    # this is intended, or switch to insertInto() with an explicit schema.
    df.write.mode("overwrite").saveAsTable(table)


if __name__ == "__main__":
    main()
-- DDL for the static lookup table describing countries.
--
-- This table belongs to analytics-product
--
-- Parameters:
--     none; the target database is selected on the command line
--
-- Usage
--     spark3-sql \
--         -f create_canonical_data_countries_table.hql \
--         --database canonical_data
--

CREATE TABLE IF NOT EXISTS `countries` (
    name              STRING  COMMENT 'Country name, aligned with the article on English Wikipedia',
    iso_code          STRING  COMMENT 'ISO 3166-1 two-letter country code',
    economic_region   STRING  COMMENT 'Global South/North, according to [[en:Global North and Global South]]',
    maxmind_continent STRING  COMMENT 'Continent, according to MaxMind databases',
    is_protected      BOOLEAN COMMENT 'Whether the country appears in [[wikitech:Country_protection_list]]',
    is_eu             BOOLEAN COMMENT 'Whether the country belongs to the European Union'
)
COMMENT 'Metadata information about countries we release data about.'
USING parquet;