#!/usr/bin/env python
#
# File Name : ptbtokenizer.py
#
# Description : Apply PTB tokenization and remove punctuation tokens.
#
# Creation Date : 29-12-2014
# Last Modified : Thu Mar 19 09:53:35 2015
# Authors : Hao Fang <[email protected]> and Tsung-Yi Lin <[email protected]>

import os
import subprocess
import tempfile

# path to the Stanford CoreNLP jar, expected to sit in the same directory
# as this file (the tokenizer is launched with that directory as its cwd)
STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'

# punctuation tokens to be removed from the tokenized sentences
PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-",
                ".", "?", "!", ",", ":", "-", "--", "...", ";"]


class PTBTokenizer:
    """Python wrapper of the Stanford PTBTokenizer.

    Maps {image_id: [caption, ...]} to a dict of the same shape whose
    captions are lowercased, space-joined token strings with punctuation
    tokens removed.
    """

    def tokenize(self, captions_for_image):
        cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR,
               'edu.stanford.nlp.process.PTBTokenizer',
               '-preserveLines', '-lowerCase']

        # ======================================================
        # prepare data for PTB Tokenizer
        # ======================================================
        final_tokenized_captions_for_image = {}
        # flatten the captions to one sentence per line; image_id[i] records
        # which image the i-th line belongs to, so the tokenizer's output
        # lines can be matched back to their images afterwards
        image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))]
        sentences = '\n'.join([c.replace('\n', ' ') for k, v in captions_for_image.items() for c in v])

        # ======================================================
        # save sentences to temporary file
        # ======================================================
        path_to_jar_dirname = os.path.dirname(os.path.abspath(__file__))
        tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
        tmp_file.write(sentences.encode())
        tmp_file.close()

        # ======================================================
        # tokenize sentences
        # ======================================================
        cmd.append(os.path.basename(tmp_file.name))
        # PTBTokenizer reads from the temp file named on the command line,
        # so nothing needs to be written to its stdin
        p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname,
                                       stdout=subprocess.PIPE)
        token_lines = p_tokenizer.communicate()[0].decode()
        lines = token_lines.split('\n')

        # remove temp file
        os.remove(tmp_file.name)

        # ======================================================
        # create dictionary for tokenized captions
        # ======================================================
        for k, line in zip(image_id, lines):
            if k not in final_tokenized_captions_for_image:
                final_tokenized_captions_for_image[k] = []
            tokenized_caption = ' '.join([w for w in line.rstrip().split(' ')
                                          if w not in PUNCTUATIONS])
            final_tokenized_captions_for_image[k].append(tokenized_caption)

        return final_tokenized_captions_for_image
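

# Minimal usage sketch, assuming Java is installed and
# stanford-corenlp-3.4.1.jar is present in this directory; the image ids
# and captions below are made-up illustration data.
if __name__ == '__main__':
    demo_captions = {
        'img1': ['A man riding a horse.', 'Two dogs play in the park!'],
        'img2': ['A close-up photo of a red apple.'],
    }
    tokenizer = PTBTokenizer()
    print(tokenizer.tokenize(demo_captions))
    # expected shape: {'img1': ['a man riding a horse', ...], 'img2': [...]}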