import re
import os

from transformers import BertTokenizer
from datasketch import MinHash, MinHashLSH
from nltk import ngrams
# Junk data
def junk_eliminate(df, re_expression=r'[&#<>{}\[\]\\]', threshold=0.01, min_len=10):
    RE_SUSPICIOUS = re.compile(re_expression)

    def impurity(text, min_len=min_len):
        """Return the share of suspicious characters in a text."""
        if text is None or len(text) < min_len:
            return 0
        else:
            return len(RE_SUSPICIOUS.findall(text)) / len(text)

    df['impurity'] = df['text'].apply(impurity, min_len=min_len)
    total_num_docs = len(df)
    impurity_num_docs = len(df[df['impurity'] >= threshold])
    impurity_ratio = impurity_num_docs / total_num_docs
    purity_df = df[df['impurity'] < threshold]
    return purity_df, impurity_ratio
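
For intuition, the impurity score is simply the fraction of characters that match the suspicious-character pattern. A minimal illustration with a made-up string (not part of the pipeline itself):

sample = "<p>Hello & welcome</p>"
suspicious = re.compile(r'[&#<>{}\[\]\\]')
# 5 of the 22 characters are markup-like, so the score is about 0.23,
# far above the default threshold of 0.01
print(len(suspicious.findall(sample)) / len(sample))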
# Biased content
def toxic_eliminate(df, l_kind='en'):
    '''
    l_kind = ['en', 'zh']
    '''
    # Fetch the raw word list (the /blob/ page URL would return HTML, not the list itself)
    os.system(f"wget https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/{l_kind}")
    with open(f'./{l_kind}', 'r') as f:
        lines = f.readlines()
    banned_words = set(line.rstrip('\n') for line in lines)
    df['banned_words_in_text'] = df['text'].apply(
        lambda text: [word for word in banned_words if word in text.lower().split()]
    )
    df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
    total_num_docs = len(df)
    biased_num_docs = df['matches'].sum()
    biased_content_ratio = biased_num_docs / total_num_docs
    non_toxic_df = df[df['matches'] == 0]
    return non_toxic_df, biased_content_ratio
# Too-short documents
def short_eliminate(df, tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'), min_len=100):
    # Create a new column with the number of tokens for each text
    df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
    total_num_docs = len(df)
    too_short_docs = len(df[df['text_length'] <= min_len])
    too_short_doc_ratio = too_short_docs / total_num_docs
    not_short_df = df[df['text_length'] > min_len]
    return not_short_df, too_short_doc_ratio
# Contamination
def process_data(df):
    # One MinHash signature per document, built from character 13-grams
    minhashes = {}
    for idx, text in enumerate(df['text']):
        minhash = MinHash(num_perm=128)
        for d in ngrams(text, 13):
            s = "".join(d).encode('utf-8')
            minhash.update(s)
        minhashes[idx] = minhash
    return minhashes

def contamination_eliminate(train_dataset, test_dataset):
    train_minhashes = process_data(train_dataset)
    test_minhashes = process_data(test_dataset)
    # Index the training documents, then count test documents that collide with any of them
    lsh = MinHashLSH(threshold=0.8, num_perm=128)
    for idx, minhash in train_minhashes.items():
        lsh.insert(idx, minhash)
    duplicates_count = 0
    for idx, minhash in test_minhashes.items():
        result = lsh.query(minhash)
        if len(result) > 0:
            duplicates_count += 1
    contamination_ratio = duplicates_count / len(test_dataset)
    return contamination_ratio
# Duplication
def duplication_eliminate(df):
    lsh = MinHashLSH(threshold=0.85, num_perm=128)
    for i, text in enumerate(df['text']):
        minhash = MinHash(num_perm=128)
        for word in text.split():
            minhash.update(word.encode('utf-8'))
        lsh.insert(str(i), minhash)
    # Record one matching key per document; near-duplicates tend to collapse to the same key
    unique_documents = set()
    for i, text in enumerate(df['text']):
        query_minhash = MinHash(num_perm=128)
        for word in text.split():
            query_minhash.update(word.encode('utf-8'))
        results = lsh.query(query_minhash)
        if results:
            unique_documents.add(results[0])
    total_unique_documents = len(unique_documents)
    total_documents = len(df)
    duplication_ratio = (total_documents - total_unique_documents) / total_documents
    return unique_documents, duplication_ratio
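
Chained together, these helpers form a small cleaning pipeline. Below is a minimal usage sketch with made-up data; train_df and test_df are illustrative pandas DataFrames with a 'text' column (not defined anywhere above), and the word-list download and BERT tokenizer require network access on first run:

import pandas as pd

train_df = pd.DataFrame({'text': [
    "This is a clean training document about preparing data for language models.",
    "{{<&#junk&#>}} \\ [broken markup] {{<&#junk&#>}}",
    "too short",
]})
test_df = pd.DataFrame({'text': [
    "This is a clean training document about preparing data for language models.",
]})

train_df, impurity_ratio = junk_eliminate(train_df)
train_df, biased_content_ratio = toxic_eliminate(train_df, l_kind='en')
train_df, too_short_doc_ratio = short_eliminate(train_df, min_len=5)
contamination_ratio = contamination_eliminate(train_df, test_df)
unique_documents, duplication_ratio = duplication_eliminate(train_df)

print(impurity_ratio, biased_content_ratio, too_short_doc_ratio,
      contamination_ratio, duplication_ratio)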