import re
import os
from transformers import BertTokenizer
from datasketch import MinHash, MinHashLSH
from nltk import ngrams
# junk data
def junk_eliminate(df, re_expression=r'[&#<>{}\[\]\\]', threshold=0.01, min_len=10):
    RE_SUSPICIOUS = re.compile(re_expression)

    def impurity(text, min_len=min_len):
        """Return the share of suspicious characters in a text."""
        if text is None or len(text) < min_len:
            return 0
        else:
            return len(RE_SUSPICIOUS.findall(text)) / len(text)

    df['impurity'] = df['text'].apply(impurity, min_len=min_len)
    total_num_docs = len(df)
    impurity_num_docs = len(df[df['impurity'] >= threshold])
    impurity_ratio = impurity_num_docs / total_num_docs
    purity_df = df[df['impurity'] < threshold]
    return purity_df, impurity_ratio
# Biased Content
def toxic_eliminate(df, l_kind='en'):
    '''
    l_kind = ['en', 'zh']
    '''
    # Download the raw banned-word list for the requested language from the LDNOOBW repo.
    os.system(f"wget https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/{l_kind}")
    with open(f'./{l_kind}', 'r') as f:
        lines = f.readlines()
    banned_words = set([line.rstrip('\n') for line in lines])
    df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
    df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
    total_num_docs = len(df)
    biased_num_docs = df['matches'].sum()
    biased_content_ratio = biased_num_docs / total_num_docs
    non_toxic_df = df[df['matches'] == 0]
    return non_toxic_df, biased_content_ratio
# Too Short Document
def short_eliminate(df, tokenizer=None, min_len=100):
    # Load the tokenizer lazily so importing this module does not trigger a download.
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Create a new column with the number of tokens for each text
    df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
    total_num_docs = len(df)
    too_short_docs = len(df[df['text_length'] <= min_len])
    too_short_doc_ratio = too_short_docs / total_num_docs
    not_short_df = df[df['text_length'] > min_len]
    return not_short_df, too_short_doc_ratio
# Contamination
def process_data(df):
    minhashes = {}
    for idx, text in enumerate(df['text']):
        minhash = MinHash(num_perm=128)
        for d in ngrams(text, 13):
            s = "".join(d).encode('utf-8')
            minhash.update(s)
        minhashes[idx] = minhash
    return minhashes
def contamination_eliminate(train_dataset, test_dataset):
    train_minhashes = process_data(train_dataset)
    test_minhashes = process_data(test_dataset)

    lsh = MinHashLSH(threshold=0.8, num_perm=128)
    for idx, minhash in train_minhashes.items():
        lsh.insert(idx, minhash)

    duplicates_count = 0
    for idx, minhash in test_minhashes.items():
        result = lsh.query(minhash)
        if len(result) > 0:
            duplicates_count += 1
    contamination_ratio = duplicates_count / len(test_dataset)
    return contamination_ratio
# Duplication
def duplication_eliminate(df):
    lsh = MinHashLSH(threshold=0.85, num_perm=128)
    for i, text in enumerate(df['text']):
        minhash = MinHash(num_perm=128)
        for word in text.split():
            minhash.update(word.encode('utf-8'))
        lsh.insert(str(i), minhash)

    unique_documents = set()
    for i, text in enumerate(df['text']):
        query_minhash = MinHash(num_perm=128)
        for word in text.split():
            query_minhash.update(word.encode('utf-8'))
        results = lsh.query(query_minhash)
        try:
            unique_documents.add(results[0])
        except Exception as e:
            print(f'error: {e}')

    total_unique_documents = len(unique_documents)
    total_documents = len(df)
    duplication_ratio = (total_documents - total_unique_documents) / total_documents
    return unique_documents, duplication_ratio
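

# Minimal usage sketch (not part of the original module): the 'text' column name,
# the toy documents, and the default thresholds below are illustrative assumptions.
if __name__ == "__main__":
    import pandas as pd

    sample_df = pd.DataFrame({
        "text": [
            "A clean, reasonably long example document about corpus quality checks.",
            "{{<junk>}} &#broken#& markup \\ heavy [[noise]] fragment with symbols",
            "A clean, reasonably long example document about corpus quality checks.",
        ]
    })

    # Share of documents whose fraction of suspicious characters exceeds the threshold.
    purity_df, impurity_ratio = junk_eliminate(sample_df.copy())
    print(f"impure document ratio: {impurity_ratio:.2f}")

    # Near-duplicate share estimated with word-level MinHash LSH.
    unique_documents, duplication_ratio = duplication_eliminate(sample_df.copy())
    print(f"duplication ratio: {duplication_ratio:.2f}")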