File size: 2,315 Bytes
			
			| c9019cd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | # Copyright (c) 2022, National Diet Library, Japan
#
# This software is released under the CC BY 4.0.
# https://creativecommons.org/licenses/by/4.0/
import pathlib
import json
import lmdb
from xmlparser import XMLRawDataset, ListRawDataset
import argparse
class Env:
    def __init__(self, output_path, interval_writeCache=1000):
        self.output_path = output_path
        self.env = lmdb.open(str(output_path), map_size=1099511627776)
        self.cache = dict()
        self.n = 0
        self.interval = interval_writeCache
    def finish_line(self):
        self.n += 1
        if self.n % 1000 == 0:
            self.writeCache()
    def writeCache(self):
        with self.env.begin(write=True) as txn:
            for k, v in self.cache.items():
                txn.put(k, v)
        self.cache = {}
        print(f'Written {self.n} lines @ {self.output_path}')
def createDataset(input_path, output_path, db_type='xml', dry_run=False):
    p = pathlib.Path(output_path)
    p.mkdir(parents=True, exist_ok=True)
    if db_type == 'xml':
        generator = XMLRawDataset.from_list(input_path, image_type=XMLRawDataset.IMAGE_TYPE_ENCODED)
    elif db_type == 'list':
        generator = ListRawDataset(input_path, image_type=XMLRawDataset.IMAGE_TYPE_ENCODED)
    if dry_run:
        return
    # generate database
    env = Env(output_path[0])
    env.cache['dbtype'.encode()] = 'xml'.encode()
    for il, (g, line) in enumerate(generator):
        env.cache[f'{env.n:09d}-direction'.encode()] = line.get('direction').encode()
        env.cache[f'{env.n:09d}-label'.encode()] = line.get('label').encode()
        env.cache[f'{env.n:09d}-cattrs'.encode()] = json.dumps(line.get('cattrs')).encode()
        env.cache[f'{env.n:09d}-image'.encode()] = g
        env.finish_line()
    env.cache['n_line'.encode()] = str(env.n).encode()
    env.writeCache()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', nargs='+', required=True)
    parser.add_argument('--output_path', required=True)
    parser.add_argument('--db_type', default='xml', choices=['xml', 'list'])
    parser.add_argument('--dry-run', action='store_true')
    opt = parser.parse_args()
    createDataset(opt.input_path, opt.output_path, opt.db_type, dry_run=opt.dry_run)
 |