Spaces:

sartajbhuvaji
/

resonate

Build error

File size: 6,053 Bytes

5f773d1

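# Uploads the sample meeting transcripts and their summaries to Pinecone.
# Runner (from the project root, since data paths are relative to the working directory):
#   python init_one_time_utils/pinecone_sample_dataloader.py
# Average Run Time: 35-40 min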
import json
import time
import pandas as pd
import sys
import os

# Make the project's root directory (the parent of this script's folder)
# importable so that the `src` package resolves when the script is run directly.
_script_dir = os.path.dirname(os.path.realpath(__file__))
project_root = os.path.dirname(_script_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

# Importing the PineconeServerless class from the project's module
from src.pinecone.resonate_pinecone_functions import PineconeServerless


class TranscriptProcessor:
    """
    A class to process and upsert transcripts to Pinecone.

    Attributes:
        SUMMARY_FILE (str): Path of the CSV that holds the meeting summaries
            (read through its "uuid" and "text" columns).
        TRANSCRIPT_FOLDER (str): Folder that contains the transcript CSV files.
        UPSERT_SETTLE_SECONDS (int): Pause, in seconds, after each successful upsert call.
        EMBEDDING_COOLDOWN_SECONDS (int): Pause, in seconds, after each processed transcript.
        uuid_to_filename_dict (dict): A mapping from UUIDs to their respective transcript file names.
        pinecone (PineconeServerless): An instance of the PineconeServerless class for database operations.
    """

    # Both paths are relative to the current working directory, so the script
    # is expected to be launched from the project root.
    SUMMARY_FILE = "./data/summaryFiles/abstract_summary_data.csv"
    TRANSCRIPT_FOLDER = "./data/transcriptFiles/"

    # Pause after each upsert call to give the operation time to complete.
    UPSERT_SETTLE_SECONDS = 5
    # Pause between transcripts to prevent the OpenAI embedding limit error.
    EMBEDDING_COOLDOWN_SECONDS = 20

    # Extension removed from a transcript file name to obtain the meeting title.
    _TRANSCRIPT_EXTENSION = ".csv"

    def __init__(self):
        """
        Initializes the TranscriptProcessor with a predefined UUID to filename mapping and a PineconeServerless instance.
        """
        # Mapping UUIDs to their respective transcript file names
        self.uuid_to_filename_dict = {
            "52d105f8-1c80-4056-8253-732b9e2bec63": "office_relocation_1.csv",
            "9ed1fefa-db53-41fc-a21b-479b67e30073": "office_relocation_2.csv",
            "e993da88-0e17-4a35-ba9a-c03decca607b": "office_relocation_3.csv",
            "61d453f1-2852-48d9-a25a-b6e04c3c4908": "office_relocation_4.csv",
            "ba94585e-b0df-4633-bef2-a4f94f644c11": "Social_Media_-_Harmed_Teens.csv",
            "906c7694-0e33-4c8e-8f51-0365155fbb81": "Social_Media_-_Ruins_your_life.csv",
            "52d2dfe4-748b-4ecf-84fb-64be6ebcaeef": "ES2014a.Mix-Headset.csv",
            "1be8e439-45b3-4c97-9e4a-5c78c1a15e78": "ES2014b.Mix-Headset_1.csv",
            "a4b7b490-7b28-4744-85e5-d216f40ff52c": "ES2014b.Mix-Headset_2.csv",
            "b3821662-03f1-4349-8781-ba5f64439693": "ES2014c.Mix-Headset.csv",
            "95efa3c5-9770-4160-9f28-35350efb9f73": "Gitlab_Monthly_Release_Kickoff_1.csv",
            "85430eae-d466-4d63-9015-5835bbe71b90": "product_marketing_meeting.csv",
            "55d8afa8-a1bf-413c-a75c-b8c14da88d87": "Gitlab_Monthly_Release_Kickoff_2.csv",
            "15b7549d-4b3f-43b5-9507-85de435f1b4a": "2023-09-26_Architecture_Design_Workflow_New_Diffs_kickoff_call_1.csv",
            "875564dc-9954-41da-9084-ccf04ebffdb0": "2023-09-26_Architecture_Design_Workflow_New_Diffs_kickoff_call_2.csv",
            "72858a28-248d-4bef-af03-c62a3c285fbb": "2023-09-26_Architecture_Design_Workflow_New_Diffs_kickoff_call_3.csv",
            "4cbd0d4e-6cf9-4db4-bf15-f4f4e4d3d8d8": "2023-10-03-New_diffs_Architecture_Workflow.csv",
            "4badb5ba-ca92-4c3c-a7e9-0d49fc7a8137": "2023-10-10_New_diffs_architecture_workflow_weekly_EMEA_AMER_1.csv",
            "9c5aa3e4-b047-4f08-a838-9b665e251e4d": "2023-10-10_New_diffs_architecture_workflow_weekly_EMEA_AMER_2.csv",
            "d7c8e3b8-c6e0-4845-8669-f2f4ed1b8549": "2023-10-17_New_diffs_architecture_blueprint_1.csv",
            "876e67fa-314d-40e4-b942-21ca63e81995": "2023-10-17_New_diffs_architecture_blueprint_2.csv",
        }
        # Initializing a PineconeServerless instance for database operations
        self.pinecone = PineconeServerless()

    def load_json_config(self, json_file_path=".//config/config.json"):
        """
        Loads a JSON configuration file.

        Parameters:
            json_file_path (str): The path to the JSON configuration file.

        Returns:
            dict: The data loaded from the JSON file.
        """
        with open(json_file_path, "r", encoding="utf-8") as file:
            return json.load(file)

    def pinecone_init_upsert(
        self, df_transcript, meeting_title, meeting_summary, meeting_uuid
    ):
        """
        Initializes and performs an upsert operation to Pinecone with transcript data.

        Parameters:
            df_transcript (DataFrame): The transcript data as a pandas DataFrame.
            meeting_title (str): The title of the meeting.
            meeting_summary (str): The summary of the meeting.
            meeting_uuid (str): The UUID of the meeting.

        Exceptions:
            Catches and prints any exceptions raised during the upsert operation.
        """
        # Deliberate best-effort: a failure on one transcript is reported and
        # swallowed here so the caller can carry on with the remaining ones.
        try:
            self.pinecone.pinecone_upsert(
                df_transcript,
                meeting_uuid=meeting_uuid,
                meeting_video_file=False,
                meeting_title=meeting_title,
                meeting_summary=meeting_summary,
            )
            # Wait for a short period to ensure the upsert operation completes
            time.sleep(self.UPSERT_SETTLE_SECONDS)
        except Exception as e:
            print("Error upserting transcript to Pinecone: ", e)

    def process_transcripts(self):
        """
        Processes and upserts all transcripts to Pinecone based on the UUID to filename mapping and the summary data.

        Summaries are read from SUMMARY_FILE and transcripts from TRANSCRIPT_FOLDER.
        Summary rows whose UUID has no entry in the filename mapping are skipped.

        Exceptions:
            Errors raised while reading the summary file or a transcript file
            (for example a missing file) are not caught and propagate to the caller.
        """
        df_summary = pd.read_csv(self.SUMMARY_FILE)
        # Creating a dictionary (uuid -> summary text) from the summaries DataFrame
        summaries_by_uuid = df_summary.set_index("uuid")["text"].to_dict()

        extension = self._TRANSCRIPT_EXTENSION

        for meeting_uuid, summary in summaries_by_uuid.items():
            filename = self.uuid_to_filename_dict.get(meeting_uuid)
            if filename is None:
                # No transcript is registered for this summary
                continue

            df_transcript = pd.read_csv(
                os.path.join(self.TRANSCRIPT_FOLDER, filename)
            )

            # Strip only the trailing extension; a ".csv" that appears elsewhere
            # in the file name is part of the meeting title and must be kept.
            if filename.endswith(extension):
                meeting_title = filename[: -len(extension)]
            else:
                meeting_title = filename

            self.pinecone_init_upsert(
                df_transcript, meeting_title, summary, meeting_uuid
            )
            # To prevent OPEN AI embedding limit error
            time.sleep(self.EMBEDDING_COOLDOWN_SECONDS)


if __name__ == "__main__":
    # Script entry point: build the processor and upload every sample transcript.
    TranscriptProcessor().process_transcripts()