Spaces:
Sleeping
Sleeping
Many changes to code organisation. More efficient searches from using intermediate outputs. Version 0.1
99d6fba
# %%
import csv

import pandas as pd

# %%
# Define your file paths
file_dir = "../"
extracted_file_path = file_dir + "2022_08_case_notes.txt"
parquet_file_path = file_dir + "2022_08_case_notes.parquet"

# %%
# Read the delimited TXT file with the csv module and convert it to a DataFrame.
# Individual fields can be very long, so raise the parser's per-field size
# limit above the csv module default.
csv.field_size_limit(1_000_000)

# newline='' lets the csv module do its own newline handling, which is required
# for quoted fields that contain embedded line breaks to be parsed correctly.
with open(extracted_file_path, mode='r', encoding='iso-8859-1', newline='') as file:
    csv_reader = csv.reader(file, delimiter=',')  # Change the delimiter if needed

    # The first row is assumed to be the header.
    header = next(csv_reader, None)
    if header is None:
        raise ValueError(f"No rows found in {extracted_file_path}")

    # Keep only rows with the same number of columns as the header; count the
    # rest so that dropped data is reported rather than silently discarded.
    expected_width = len(header)
    valid_rows = []
    skipped_count = 0
    for row in csv_reader:
        if len(row) == expected_width:
            valid_rows.append(row)
        else:
            skipped_count += 1

if skipped_count:
    print(f"Skipped {skipped_count} row(s) whose column count did not match the header")

# Convert list of rows to DataFrame (every column holds strings at this point)
casenotes = pd.DataFrame(valid_rows, columns=header)
print(casenotes.head())  # Display the first few rows of the DataFrame

# %%
# Write the DataFrame out as a Parquet file
casenotes.to_parquet(parquet_file_path)