|
import duckdb |
|
import yaml |
|
import time |
|
|
|
def extract_base_models(tags): |
|
base_models = set() |
|
for tag in tags: |
|
if tag.startswith("base_model:"): |
|
base_model = tag.split(":")[-1] |
|
base_model = ''.join(c for c in base_model if c.isalnum() or c in '/_.-') |
|
base_models.add(base_model) |
|
return list(base_models) |
|
|
|
|
|
con = duckdb.connect() |
|
|
|
|
|
con.create_function("extract_base_models", extract_base_models, return_type="VARCHAR[]") |
|
|
|
|
|
query = """ |
|
SELECT |
|
_id, |
|
id, |
|
extract_base_models(tags) AS base_models |
|
FROM parquet_scan('tables/models.parquet') |
|
""" |
|
|
|
start_time = time.time() |
|
|
|
|
|
con.execute(f"CREATE VIEW parent_models AS {query}") |
|
|
|
|
|
con.execute("COPY parent_models TO 'tables/parents.parquet' (FORMAT 'parquet')") |
|
|
|
end_time = time.time() |
|
execution_time = end_time - start_time |
|
|
|
print(f"Query execution time: {execution_time:.2f} seconds") |
|
|
|
|
|
con.execute(""" |
|
CREATE VIEW non_empty_base_models AS |
|
SELECT * |
|
FROM parent_models |
|
WHERE ARRAY_LENGTH(base_models) > 0 |
|
""") |
|
|
|
|
|
result = con.execute(""" |
|
SELECT _id, id, base_models |
|
FROM non_empty_base_models |
|
ORDER BY RANDOM() |
|
LIMIT 10 |
|
""").fetchall() |
|
|
|
with open("tables/parents.example.yaml", "w") as f: |
|
yaml.safe_dump(result, f, default_flow_style=False) |
|
|