import re
import os
from transformers import BertTokenizer
from datasketch import MinHash, MinHashLSH
from nltk import ngrams
# Junk data: drop documents with a high share of suspicious markup characters.
def junk_eliminate(df, re_expression=r'[&#<>{}\[\]\\]', threshold=0.01, min_len=10):
    RE_SUSPICIOUS = re.compile(re_expression)

    def impurity(text, min_len=min_len):
        """Return the share of suspicious characters in a text."""
        if text is None or len(text) < min_len:
            return 0
        else:
            return len(RE_SUSPICIOUS.findall(text)) / len(text)

    df['impurity'] = df['text'].apply(impurity, min_len=min_len)
    total_num_docs = len(df)
    impurity_num_docs = len(df[df['impurity'] >= threshold])
    impurity_ratio = impurity_num_docs / total_num_docs
    purity_df = df[df['impurity'] < threshold]
    return purity_df, impurity_ratio
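# Illustrative usage of junk_eliminate (not part of the original file; the
# sample rows and the pandas import are assumptions for demonstration only).
def _demo_junk_eliminate():
    import pandas as pd
    docs = pd.DataFrame({'text': [
        "A long, perfectly ordinary sentence without any markup residue in it.",
        "{{<div>}} [[broken]] \\markup\\ &# leftovers scattered everywhere here",
    ]})
    clean_df, junk_ratio = junk_eliminate(docs, threshold=0.01)
    # Only the first row survives; half of this toy corpus is flagged as junk.
    print(clean_df[['text', 'impurity']], junk_ratio)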
# Biased content: drop documents containing words from the LDNOOBW blocklist.
def toxic_eliminate(df, l_kind='en'):
    '''
    l_kind = ['en', 'zh']
    '''
    # Fetch the raw word list; the /blob/ page URL would return HTML instead.
    os.system(
        f"wget -O ./{l_kind} https://raw.githubusercontent.com/LDNOOBW/"
        f"List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/{l_kind}"
    )
    with open(f'./{l_kind}', 'r') as f:
        banned_words = set(line.rstrip('\n') for line in f)
    df['banned_words_in_text'] = df['text'].apply(
        lambda text: [word for word in banned_words if word in text.lower().split()]
    )
    df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
    total_num_docs = len(df)
    biased_num_docs = df['matches'].sum()
    biased_content_ratio = biased_num_docs / total_num_docs
    non_toxic_df = df[~df['matches']]
    return non_toxic_df, biased_content_ratio
# Too-short documents: drop texts with at most `min_len` BERT tokens.
def short_eliminate(df, tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'), min_len=100):
    # Create a new column with the number of tokens for each text
    df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
    total_num_docs = len(df)
    too_short_docs = len(df[df['text_length'] <= min_len])
    too_short_doc_ratio = too_short_docs / total_num_docs
    not_short_df = df[df['text_length'] > min_len]
    return not_short_df, too_short_doc_ratio
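# Sketch of chaining the three row-level filters above (an illustration, not
# part of the original pipeline); `corpus_df` is any pandas DataFrame with a
# 'text' column.
def _run_row_filters(corpus_df, l_kind='en'):
    corpus_df, junk_ratio = junk_eliminate(corpus_df)
    corpus_df, toxic_ratio = toxic_eliminate(corpus_df, l_kind=l_kind)
    corpus_df, short_ratio = short_eliminate(corpus_df, min_len=100)
    stats = {'junk': junk_ratio, 'toxic': toxic_ratio, 'short': short_ratio}
    return corpus_df, stats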
# Contamination: flag test documents whose MinHash over character 13-grams
# collides with a training document.
def process_data(df):
    minhashes = {}
    for idx, text in enumerate(df['text']):
        minhash = MinHash(num_perm=128)
        for d in ngrams(text, 13):
            s = "".join(d).encode('utf-8')
            minhash.update(s)
        minhashes[idx] = minhash
    return minhashes
def contamination_eliminate(train_dataset, test_dataset):
    train_minhashes = process_data(train_dataset)
    test_minhashes = process_data(test_dataset)
    lsh = MinHashLSH(threshold=0.8, num_perm=128)
    for idx, minhash in train_minhashes.items():
        lsh.insert(idx, minhash)
    duplicates_count = 0
    for idx, minhash in test_minhashes.items():
        result = lsh.query(minhash)
        if len(result) > 0:
            duplicates_count += 1
    contamination_ratio = duplicates_count / len(test_dataset)
    return contamination_ratio
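# Toy train/test leakage check (an illustration; both DataFrames and their
# rows are made up). Identical passages share all character 13-grams, so the
# MinHash similarity is ~1.0 and that test document counts as contaminated.
def _demo_contamination():
    import pandas as pd
    passage = "this exact passage appears verbatim in both the train and the test split"
    train_df = pd.DataFrame({'text': [passage, "an unrelated training document about something else entirely"]})
    test_df = pd.DataFrame({'text': [passage, "a genuinely fresh test document that overlaps with nothing"]})
    print(contamination_eliminate(train_df, test_df))  # 0.5 on this toy data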
# Duplication: estimate the share of near-duplicate documents with MinHash LSH.
def duplication_eliminate(df):
    lsh = MinHashLSH(threshold=0.85, num_perm=128)
    for i, text in enumerate(df['text']):
        minhash = MinHash(num_perm=128)
        for word in text.split():
            minhash.update(word.encode('utf-8'))
        lsh.insert(str(i), minhash)
    unique_documents = set()
    for i, text in enumerate(df['text']):
        query_minhash = MinHash(num_perm=128)
        for word in text.split():
            query_minhash.update(word.encode('utf-8'))
        results = lsh.query(query_minhash)
        # Each group of near-duplicates is represented by the first key returned,
        # so duplicates collapse onto a single entry in `unique_documents`.
        if results:
            unique_documents.add(results[0])
    total_unique_documents = len(unique_documents)
    total_documents = len(df)
    duplication_ratio = (total_documents - total_unique_documents) / total_documents
    return unique_documents, duplication_ratio
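# Illustrative near-duplicate scan (rows are made up for demonstration). The
# two identical rows collapse onto one representative LSH key, so roughly a
# third of this toy corpus is reported as duplicated.
def _demo_duplication():
    import pandas as pd
    df = pd.DataFrame({'text': [
        "the exact same sentence stored twice in the corpus by mistake",
        "the exact same sentence stored twice in the corpus by mistake",
        "a completely different document about data contamination and benchmark leakage",
    ]})
    kept_keys, dup_ratio = duplication_eliminate(df)
    print(kept_keys, dup_ratio)  # expect two representative keys and dup_ratio ~ 1/3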