Sigrid De los Santos
Remove remaining binary file for Hugging Face
9df4cc0
import pandas as pd
import easygui as gui
def _count_non_http_links(df: pd.DataFrame) -> int:
    """Return how many rows of ``df`` have a 'link' value that lacks "http".

    Missing values count as lacking "http". The column is cast to ``str``
    first so that a non-text column (for example an all-empty column that
    pandas parsed as float) does not raise ``AttributeError`` on ``.str``.
    """
    has_http = df["link"].astype(str).str.contains("http", regex=False, na=False)
    return int((~has_http).sum())


def _count_differing_rows(df: pd.DataFrame) -> int:
    """Return how many rows differ between 'text' and 'contextualized_sentence'."""
    text = df["text"]
    contextualized = df["contextualized_sentence"]
    # NaN != NaN evaluates to True in pandas, so a row that is empty in both
    # columns would otherwise be reported as "different".
    both_missing = text.isna() & contextualized.isna()
    return int(((text != contextualized) & ~both_missing).sum())


def find_different_rows() -> None:
    """Ask the user for a CSV file and report two consistency checks on it.

    1. The number of rows whose 'link' value does not contain "http".
    2. The number of rows whose 'text' and 'contextualized_sentence' values
       differ.

    Every result is shown in an easygui message box; nothing is returned.
    If the 'link' column is missing, the function stops after reporting that
    and the second check is not run. If no file is chosen, only a
    "No file selected." message is shown.
    """
    # Prompt user to select CSV file
    file_path = gui.fileopenbox("Select CSV file", filetypes=["*.csv"])
    if not file_path:
        gui.msgbox("No file selected.")
        return

    # Read the file once; both checks operate on the same frame.
    df = pd.read_csv(file_path)

    # --- Check 1: links without "http" ---
    if "link" not in df.columns:
        gui.msgbox("'link' column is missing.")
        return
    non_http_count = _count_non_http_links(df)
    if non_http_count:
        gui.msgbox("Total number of rows without 'http' in 'link' column: {}".format(non_http_count))
    else:
        gui.msgbox("No rows found without 'http' in 'link' column.")

    # --- Check 2: 'text' versus 'contextualized_sentence' ---
    if "text" not in df.columns or "contextualized_sentence" not in df.columns:
        gui.msgbox("Either or both 'text' and 'contextualized_sentence' columns are missing.")
        return
    different_count = _count_differing_rows(df)
    if different_count:
        gui.msgbox("total number is {}".format(different_count))
    else:
        gui.msgbox("No rows found with different values for 'text' and 'contextualized_sentence'.")
# Script entry point: run the interactive CSV check only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    find_different_rows()