postgres • minio • dbt • lightsail • caddy
cat Dockerfile
FROM python:3.11-slim
RUN pip install duckdb pandas
WORKDIR /data
ENTRYPOINT ["python"]
input - world bank dataset
'Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960' ... '2024' ... 'column69'
"Aruba","ABW","Population, total","SP.POP.TOTL","54922" ... "56320",""
...
import duckdb
def unpivot_via_pandas():
assert False
df = duckdb.query("SELECT * FROM '/data/countrypopyear.csv'").to_df()
cc = df.columns.tolist()
excluded = lambda cx : 'Name' in cx or 'Code' in cx or 'column' in cx
unpivot_nonmeasures = lambda cx : 'Name' in cx or 'Code' in cx
ccyyyy = [x for x in cc if not excluded(x)]
nonms = [x for x in cc if unpivot_nonmeasures(x)]
print(ccyyyy)
print(nonms)
ccyyyylist = ','.join(ccyyyy)
# ... etc.
return None
# https://duckdb.org/docs/stable/sql/statements/unpivot
# https://duckdb.org/docs/stable/sql/expressions/star.html#columns-expression
ctas = """
CREATE TABLE poppivot AS SELECT
COLUMNS(c -> c LIKE '%Name' OR c LIKE '%Code'),
CAST(COLUMNS(c -> c LIKE '____') AS BIGINT)
FROM read_csv_auto('/data/countrypopyear.csv');
"""
upsql = """
UNPIVOT poppivot
ON COLUMNS(c -> c NOT LIKE '%Code' AND c NOT LIKE '%Name')
INTO
NAME year
VALUE population;
"""
reference_sql_1 = """
UNPIVOT monthly_sales
ON jan, feb, mar, apr, may, jun
INTO
NAME month
VALUE sales;
"""
duckdb.execute(ctas)
df = duckdb.execute("SELECT * FROM poppivot").fetchdf()
dfu = duckdb.query(upsql).df()
print('max')
print(dfu.max(axis=1,numeric_only=True))
max_value = df.select_dtypes(include='number').max().max()
print("Max value:", max_value)
dfu.columns = dfu.columns.str.replace(' ', '_')
dfu.columns = [x.lower() for x in dfu.columns]
dfu.to_csv('unpivot_country_popyear.csv', index=False)
output:
country_name,country_code,indicator_name,indicator_code,year,population
Aruba,ABW,"Population, total",SP.POP.TOTL,1960,5492