Spaces:

Tzktz
/

Dit-document-layout-analysis

Sleeping

File size: 8,834 Bytes

6fc683c

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import random
import json
import pickle
from tqdm import tqdm
import os
import numpy as np


class CaptionDedupProcessor(object):
    """Remove overlapping text between consecutive caption clips.

    ``self.data`` maps a video id to a JSON string encoding a caption dict
    with three parallel lists: ``"start"``, ``"end"`` and ``"text"``.
    Calling the processor rewrites every entry in place with its
    de-duplicated version and prints summary statistics.

    Some statistics:
    caption:
    {'t_clip_len': 246.6448431320854,
    'video_len': 281.09174795676245,
    'clip_tps': 0.8841283727427481,
    'video_tps': 0.7821156477732097,
    'min_clip_len': 0.0,
    'max_clip_len': 398.3,
    'mean_clip_len': 3.196580003006861,
    'num_clip': 77.15897706301081}

    raw_caption:
    {'t_clip_len': 238.95908778424115,
    'video_len': 267.5914859862507,
    'clip_tps': 2.4941363624267963,
    'video_tps': 2.258989769647173,
    'min_clip_len': 0.0,
    'max_clip_len': 398.3,
    'mean_clip_len': 3.0537954186814265,
    'num_clip': 78.24986779481756}
    """

    # Directory searched for ``<video_id>.npy`` feature files when collecting
    # statistics. Can be overridden per instance through ``__init__``.
    feat_dir = "data/feat/feat_how2_s3d"

    def __init__(self, pkl_file, feat_dir=None):
        """Load the pickled caption dict.

        Args:
            pkl_file: path to a pickle holding ``{video_id: json_string}``.
            feat_dir: optional directory of ``<video_id>.npy`` feature files
                used only for statistics; defaults to the class-level
                ``feat_dir``.
        """
        # NOTE: pickle.load can execute arbitrary code; only open pickle
        # files produced by a trusted source (e.g. this script itself).
        with open(pkl_file, "rb") as fd:
            self.data = pickle.load(fd)
        if feat_dir is not None:
            self.feat_dir = feat_dir
        self.stat = {
            "t_clip_len": [],
            "video_len": [],
            "clip_tps": [],
            "video_tps": [],
            "clip_len": [],
        }

    def __call__(self):
        """De-duplicate every caption in ``self.data`` in place, then print stats."""
        # Only values of existing keys are replaced, so iterating the dict
        # while assigning is safe (its size never changes).
        for idx, video_id in enumerate(tqdm(self.data)):
            caption = self._dedup(json.loads(self.data[video_id]))
            if idx < 4096:  # for the first 4096 examples, compute the statistics.
                self.save_stat(video_id, caption)
            self.data[video_id] = json.dumps(caption)
        self.print_stat()

    @staticmethod
    def _print_clips(caption):
        """Print one ``start end text`` line per clip of ``caption``."""
        for start, end, text in zip(
            caption["start"], caption["end"], caption["text"]
        ):
            print(start, end, text)

    def single(self, video_id):
        """Debug helper: print one video's clips before and after de-duplication.

        ``self.data`` is not modified; the statistics of the de-duplicated
        caption are accumulated and printed.
        """
        caption = json.loads(self.data[video_id])
        self._print_clips(caption)
        print("@" * 100)
        caption = self._dedup(caption)
        self._print_clips(caption)
        print("#" * 100)
        self.save_stat(video_id, caption)
        self.print_stat()

    def finalize(self, tgt_fn):
        """Pickle ``self.data`` to ``tgt_fn``."""
        with open(tgt_fn, "wb") as fw:
            pickle.dump(self.data, fw, pickle.HIGHEST_PROTOCOL)

    def save_stat(self, video_id, caption):
        """Accumulate length / token-rate statistics for one caption.

        Nothing is recorded when the video's feature file is missing, or when
        the total clip length or the video length is zero (the per-length
        token rates would be undefined).

        Args:
            video_id: id used to locate ``<feat_dir>/<video_id>.npy``.
            caption: dict with parallel ``"start"``, ``"end"``, ``"text"`` lists.
        """
        video_fn = os.path.join(self.feat_dir, video_id + ".npy")
        if not os.path.isfile(video_fn):
            return

        # Memory-map the file so only the header is parsed; the first
        # dimension of the stored array is used as the video length.
        # (presumably one feature row per second -- TODO confirm)
        video_len = np.load(video_fn, mmap_mode="r").shape[0]

        clip_lens = []
        t_clip_len = 0.0
        t_tokens = 0
        prev_end = None
        for start, end, text in zip(
            caption["start"], caption["end"], caption["text"]
        ):
            # Time already covered by the previous clip is not counted twice.
            if prev_end is None:
                clip_len = end - start
            else:
                clip_len = end - max(prev_end, start)
            prev_end = end
            t_clip_len += clip_len
            t_tokens += len(text.split(" "))
            clip_lens.append(clip_len)

        if t_clip_len == 0 or video_len == 0:
            # Avoid ZeroDivisionError; such a video carries no usable rate.
            return

        self.stat["clip_len"].extend(clip_lens)
        self.stat["t_clip_len"].append(t_clip_len)
        self.stat["video_len"].append(video_len)
        self.stat["clip_tps"].append(t_tokens / t_clip_len)
        self.stat["video_tps"].append(t_tokens / video_len)

    def print_stat(self):
        """Print the aggregated statistics collected by ``save_stat``."""
        if not self.stat["clip_len"] or not self.stat["video_tps"]:
            # Happens e.g. when no feature file exists under ``feat_dir``;
            # this must not abort the run before ``finalize`` is reached.
            print("no statistics collected.")
            return
        result = {
            "t_clip_len": np.mean(self.stat["t_clip_len"]),
            "video_len": np.mean(self.stat["video_len"]),
            "clip_tps": np.mean(self.stat["clip_tps"]),
            "video_tps": np.mean(self.stat["video_tps"]),
            "min_clip_len": min(self.stat["clip_len"]),
            "max_clip_len": max(self.stat["clip_len"]),
            "mean_clip_len": np.mean(self.stat["clip_len"]),
            "num_clip": len(self.stat["clip_len"]) / len(self.stat["video_tps"]),
        }
        print(result)

    def _dedup(self, caption):
        """Return a new caption dict with overlapping text between clips removed.

        Clips whose text is not a ``str`` or is empty after newline
        replacement and stripping are dropped. Each remaining clip is
        compared with the last kept clip:

        * the last kept text ends with the new text -> only the end time of
          the kept clip is extended;
        * the new text starts with the last kept text -> the kept clip takes
          the new text and the union of both time spans;
        * a prefix of the new text equals a suffix of the kept text -> the
          overlap is randomly assigned to one side (``random_merge``);
        * otherwise the clip is appended unchanged.

        Args:
            caption: dict with parallel ``"start"``, ``"end"``, ``"text"`` lists.

        Returns:
            dict with the same three keys holding the de-duplicated lists;
            all three lists are empty when no clip has usable text.
        """

        def random_merge(end_idx, start, end, text, starts, ends, texts):
            """Resolve a partial overlap of ``end_idx`` characters.

            ``text[:end_idx]`` equals the tail of ``texts[-1]``. With
            probability 0.5 each, the overlap is kept in the previous clip or
            in the current one. Mutates ``starts``/``ends``/``texts`` in place.
            """
            if random.random() > 0.5:
                # print(clip_idx, "[PARTIAL INTO PREV]", end_idx)
                # overlapped part goes to the end of previous.
                ends[-1] = max(ends[-1], start)  # ?
                rest_text = text[end_idx:].strip()
                if rest_text:
                    # The remainder becomes its own clip, starting no earlier
                    # than the (possibly extended) end of the previous one.
                    starts.append(max(ends[-1], start))
                    ends.append(max(end, starts[-1]))
                    texts.append(rest_text)
            else:  # goes to the beginning of the current.
                # strip the previous.
                left_text = texts[-1][:-end_idx].strip()
                if left_text:
                    # print(clip_idx, "[PREV PARTIAL INTO CUR]", end_idx)
                    ends[-1] = min(ends[-1], start)
                    texts[-1] = left_text
                else:
                    # print(clip_idx, "[PREV LEFT NOTHING ALL INTO CUR]", end_idx)
                    starts.pop(-1)
                    ends.pop(-1)
                    texts.pop(-1)
                starts.append(start)
                ends.append(end)
                texts.append(text)

        starts, ends, texts = [], [], []
        for clip_idx, (start, end, text) in enumerate(
            zip(caption["start"], caption["end"], caption["text"])
        ):
            if not isinstance(text, str):
                continue
            text = text.replace("\n", " ").strip()
            if not text:
                continue

            if not texts:
                # First usable clip: kept as-is, nothing to compare against.
                # Once one clip is kept, ``texts`` never becomes empty again,
                # so this branch runs at most once per caption.
                starts.append(start)
                ends.append(end)
                texts.append(text)
                continue

            # print(clip_idx, texts[-5:])
            # print(clip_idx, start, end, text)
            if texts[-1].endswith(text):  # subset of prev caption -> merge
                # print(clip_idx, "[MERGE INTO PREV]")
                ends[-1] = max(ends[-1], end)
            elif text.startswith(texts[-1]):  # superset of prev caption -> merge
                # print(clip_idx, "[PREV MERGE INTO CUR]")
                texts[-1] = text
                starts[-1] = min(starts[-1], start)
                ends[-1] = max(ends[-1], end)
            else:  # overlapping or non-overlapping.
                # The shortest matching prefix wins, so a single shared
                # character already counts as an overlap.
                # NOTE(review): confirm shortest (not longest) overlap is intended.
                for end_idx in range(1, len(text) + 1):
                    if texts[-1].endswith(text[:end_idx]):
                        random_merge(end_idx, start, end, text, starts, ends, texts)
                        break
                else:
                    starts.append(start)
                    ends.append(end)
                    texts.append(text)

            # Sanity check on the last kept clip. ``clip_idx`` is the index
            # into the raw caption lists and is >= 1 here, so ``clip_idx - 1``
            # is the raw clip immediately preceding the current one.
            assert (ends[-1] + 0.001) >= starts[-1] and len(
                texts[-1]
            ) > 0, "{} {} {} <- {} {} {}, {} {} {}".format(
                str(starts[-1]),
                str(ends[-1]),
                texts[-1],
                caption["start"][clip_idx - 1],
                caption["end"][clip_idx - 1],
                caption["text"][clip_idx - 1],
                str(start),
                str(end),
                text,
            )

        return {"start": starts, "end": ends, "text": texts}


if __name__ == "__main__":
    # Script entry point: convert the raw How2 caption JSON to a pickle (once),
    # de-duplicate every caption, and write the result to a second pickle.
    import argparse

    parser = argparse.ArgumentParser(description="dedup how2 caption")
    parser.add_argument('--how2dir', default="data/how2")
    args = parser.parse_args()

    raw_caption_json = os.path.join(args.how2dir, "raw_caption.json")
    raw_caption_pickle = os.path.join(args.how2dir, "raw_caption.pkl")
    raw_caption_dedup_pickle = os.path.join(args.how2dir, "raw_caption_dedup.pkl")

    def convert_to_pickle(src_fn, tgt_fn):
        """Re-encode the JSON file ``src_fn`` as a pickle of per-video JSON strings.

        Each top-level value is serialized back to a JSON string so the
        pickle maps ``video_id -> json_string``, which is the layout
        ``CaptionDedupProcessor`` decodes with ``json.loads``.
        """
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(src_fn, encoding="utf-8") as fd:
            captions = json.load(fd)

        for video_id in captions:
            captions[video_id] = json.dumps(captions[video_id])

        with open(tgt_fn, "wb") as fw:
            pickle.dump(captions, fw, pickle.HIGHEST_PROTOCOL)

    # The conversion is skipped when the pickle already exists.
    if not os.path.isfile(raw_caption_pickle):
        convert_to_pickle(raw_caption_json, raw_caption_pickle)

    deduper = CaptionDedupProcessor(raw_caption_pickle)
    deduper()
    deduper.finalize(raw_caption_dedup_pickle)

    # demo
    # deduper = CaptionDedupProcessor("data/how2/raw_caption.pkl")
    # deduper.single("HfIeQ9pzL5U")