File size: 1,743 Bytes
9df4cc0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pandas as pd
import easygui as gui

def _count_non_http_links(df):
    """Return how many rows of ``df`` have a 'link' value without "http".

    Missing values are counted as non-http rows. The caller must ensure the
    'link' column exists.
    """
    # Cast to str first: a column pandas parsed as non-text (for example all
    # empty cells read as float) has no usable ``.str`` accessor and would
    # raise AttributeError.
    links = df["link"].astype(str)
    # "http" is a literal substring, not a pattern, so skip regex matching.
    has_http = links.str.contains("http", regex=False, na=False)
    return int((~has_http).sum())


def _count_different_rows(df):
    """Return how many rows differ between 'text' and 'contextualized_sentence'.

    Rows where both cells are missing are treated as equal. The caller must
    ensure both columns exist.
    """
    text = df["text"]
    sentence = df["contextualized_sentence"]
    # NaN != NaN evaluates to True element-wise, so without this mask every
    # row with two empty cells would be reported as "different".
    both_missing = text.isna() & sentence.isna()
    return int((text.ne(sentence) & ~both_missing).sum())


def find_different_rows():
    """Ask the user for a CSV file and report two row counts in message boxes.

    The first report is the number of rows whose 'link' column does not
    contain "http". The second is the number of rows whose 'text' and
    'contextualized_sentence' columns hold different values. Each check is
    skipped with an explanatory message if its columns are absent; a missing
    'link' column does not prevent the text comparison from running.

    Returns:
        None. All results are shown to the user through easygui dialogs.
    """
    # Prompt user to select CSV file
    file_path = gui.fileopenbox("Select CSV file", filetypes=["*.csv"])
    if not file_path:
        gui.msgbox("No file selected.")
        return

    # Read the file once and reuse the frame for both checks. Surface read
    # failures in a dialog, since a GUI user may never see a console traceback.
    try:
        df = pd.read_csv(file_path)
    except (
        OSError,
        UnicodeDecodeError,
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
    ) as exc:
        gui.msgbox("Could not read CSV file: {}".format(exc))
        return

    # Check 1: links that do not contain "http"
    if "link" not in df.columns:
        gui.msgbox("'link' column is missing.")
    else:
        non_http_count = _count_non_http_links(df)
        if non_http_count:
            gui.msgbox("Total number of rows without 'http' in 'link' column: {}".format(non_http_count))
        else:
            gui.msgbox("No rows found without 'http' in 'link' column.")

    # Check 2: rows where "text" and "contextualized_sentence" differ
    if "text" not in df.columns or "contextualized_sentence" not in df.columns:
        gui.msgbox("Either or both 'text' and 'contextualized_sentence' columns are missing.")
        return

    different_count = _count_different_rows(df)
    if different_count:
        gui.msgbox("total number is {}".format(different_count))
    else:
        gui.msgbox("No rows found with different values for 'text' and 'contextualized_sentence'.")

# Launch the interactive check only when this file is executed as a script,
# so importing the module does not open any dialogs.
if __name__ == "__main__":
    find_different_rows()