# Scraped-page residue (Hugging Face "Spaces: Running on Zero" banner) — kept as a
# comment so the file parses; not part of the script.
"""
Split images into small patches and insert them into an SQLite db. Reading and
inserting speeds are much better than Ubuntu's (18.04) file system when the
number of patches is larger than 20k, and it has a smaller size than using the
h5 format.

Recommend checking or filtering out small-size patches as their content varies
little. 128x128 seems better than 64x64.
"""
import sqlite3

from torch.utils.data import DataLoader
from tqdm import trange

from Dataloader import Image2Sqlite

# One connection/cursor shared by the whole script; patches are stored as BLOBs.
conn = sqlite3.connect("dataset/image_yandere.db")
cursor = conn.cursor()
# Trade durability for insert speed; acceptable because the db can be rebuilt
# from the source images if the process crashes.
with conn:
    cursor.execute("PRAGMA SYNCHRONOUS = OFF")

table_name = "train_images_size_128_noise_1_rgb"
lr_col = "lr_img"  # low-resolution (input) patch blob
hr_col = "hr_img"  # high-resolution (target) patch blob

with conn:
    conn.execute(
        f"CREATE TABLE IF NOT EXISTS {table_name} ({lr_col} BLOB, {hr_col} BLOB)"
    )
# Patch extractor: 256px source patches, 2x shrink to produce the LR side,
# noise level 1, RGB. down_sample_method=None lets the loader pick its default.
dat = Image2Sqlite(
    img_folder="./dataset/yande.re_test_shrink",
    patch_size=256,
    shrink_size=2,
    noise_level=1,
    down_sample_method=None,
    color_mod="RGB",
    dummy_len=None,
)
print(f"Total images {len(dat)}")

# Each iteration yields a batch of (lr_blob, hr_blob) lists; shuffle so repeated
# passes below sample patches in a different order.
img_dat = DataLoader(dat, num_workers=6, batch_size=6, shuffle=True)
# Each outer pass iterates the full DataLoader once (random patches per pass),
# then bulk-inserts the surviving patch pairs in a single transaction.
num_batches = 20
for _ in trange(num_batches):
    bulk = []
    for lrs, hrs in img_dat:
        bulk.extend(zip(lrs, hrs))
    # Filter out small (low-content) patches by encoded size.
    # For 128x128, 14000 is fair; around 20% of patches are filtered out.
    bulk = [pair for pair in bulk if len(pair[0]) > 15000]
    cursor.executemany(
        f"INSERT INTO {table_name}({lr_col}, {hr_col}) VALUES (?,?)", bulk
    )
    conn.commit()

# Report how many rows the table now holds (max rowid ~= row count here).
cursor.execute(f"select max(rowid) from {table_name}")
print(cursor.fetchall())
conn.commit()
# +++++++++++++++++++++++++++++++++++++
# Used for creating a test database
# (kept as a reference recipe; uncomment to keep only the 400 largest patches,
#  or to move small patches into a side table)
# -------------------------------------
# cursor.execute(f"SELECT ROWID FROM {table_name} ORDER BY LENGTH({lr_col}) DESC LIMIT 400")
# rowdis = cursor.fetchall()
# rowdis = ",".join([str(i[0]) for i in rowdis])
#
# cursor.execute(f"DELETE FROM {table_name} WHERE ROWID NOT IN ({rowdis})")
# conn.commit()
# cursor.execute("vacuum")
#
# cursor.execute("""
# CREATE TABLE IF NOT EXISTS train_images_size_128_noise_1_rgb_small AS
# SELECT *
# FROM train_images_size_128_noise_1_rgb
# WHERE length(lr_img) < 14000;
# """)
#
# cursor.execute("""
# DELETE
# FROM train_images_size_128_noise_1_rgb
# WHERE length(lr_img) < 14000;
# """)

# Reclaim free pages and compact the file (also resets internal rowid gaps).
cursor.execute("VACUUM")
conn.commit()
# +++++++++++++++++++++++++++++++++++++
# Check image size: dump the 100 largest HR patches to disk for eyeballing.
# -------------------------------------
from PIL import Image
import io

cursor.execute(
    f"""
select {hr_col} from {table_name}
ORDER BY LENGTH({hr_col}) desc
limit 100
"""
)
# Alternative filter, kept for reference:
# WHERE LENGTH({lr_col}) BETWEEN 14000 AND 16000
# small = cursor.fetchall()
# print(len(small))

# Iterate the cursor lazily; each row is a 1-tuple holding the encoded blob.
for idx, row in enumerate(cursor):
    img = Image.open(io.BytesIO(row[0]))
    img.save(f"dataset/check/{idx}.png")
# +++++++++++++++++++++++++++++++++++++
# Check image variance: histogram of encoded LR patch sizes, a cheap proxy
# for content variance (low-content patches compress to fewer bytes).
# -------------------------------------
import pandas as pd
import matplotlib.pyplot as plt

dat = pd.read_sql(f"SELECT length({lr_col}) from {table_name}", conn)
dat.hist(bins=20)
plt.show()