Duplicate from tomiwa1a/video-search
Co-authored-by: Tomiwa Ademidun <[email protected]>
- .gitattributes +32 -0
- README.md +22 -0
- create_handler.ipynb +289 -0
- handler.py +260 -0
- requirements.txt +3 -0
- sample1.flac +0 -0
.gitattributes
ADDED
@@ -0,0 +1,32 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,22 @@
---
license: gpl-3.0
tags:
- audio
- automatic-speech-recognition
- endpoints-template
library_name: generic
inference: false
duplicated_from: tomiwa1a/video-search
---

# Video Search

This project contains three different models that can be used for searching videos.

1. Whisper to transcribe mp3 audio files to text
2. A Sentence Transformer (multi-qa-mpnet-base-dot-v1) to generate vector embeddings from text
3. BART LFQA to generate long-form answers given a context

For more context, see: [Atlas: Find Anything on Youtube](https://atila.ca/blog/tomiwa/atlas)

Inspired by [philschmid/openai-whisper-endpoint](https://huggingface.co/philschmid/openai-whisper-endpoint)
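To make the three request shapes concrete, here is a minimal sketch of the payloads accepted by `handler.py` in this repository. The YouTube URL, query text, and context passages are placeholders; the `inputs` key is included because `EndpointHandler.__call__` rejects payloads that omit it.

```python
# Hypothetical payloads for the EndpointHandler defined in handler.py below.

# 1. Transcribe a YouTube video with Whisper and embed the transcript segments.
transcribe_payload = {
    "inputs": "",  # required key; its value is not used by the handler
    "video_url": "https://www.youtube.com/watch?v=VIDEO_ID",  # placeholder
    "encode_transcript": True,
}

# 2. Embed a free-text query with the sentence transformer for semantic search.
query_payload = {"inputs": "", "query": "how do vector embeddings work?"}

# 3. Generate a long-form answer with BART LFQA from retrieved context passages.
answer_payload = {
    "inputs": "",
    "query": "how do vector embeddings work?",
    "long_form_answer": True,
    "context": ["a passage retrieved from the transcript", "another passage"],
}
```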
create_handler.ipynb
ADDED
@@ -0,0 +1,289 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Setup & Installation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting requirements.txt\n"
     ]
    }
   ],
   "source": [
    "%%writefile requirements.txt\n",
    "git+https://github.com/openai/whisper.git@8cf36f3508c9acd341a45eb2364239a3d81458b9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -r requirements.txt --upgrade"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Test model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2022-09-23 20:32:18-- https://cdn-media.huggingface.co/speech_samples/sample1.flac\n",
      "Resolving cdn-media.huggingface.co (cdn-media.huggingface.co)... 13.32.151.62, 13.32.151.23, 13.32.151.60, ...\n",
      "Connecting to cdn-media.huggingface.co (cdn-media.huggingface.co)|13.32.151.62|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 282378 (276K) [audio/flac]\n",
      "Saving to: ‘sample1.flac’\n",
      "\n",
      "sample1.flac        100%[===================>] 275.76K  --.-KB/s    in 0.003s  \n",
      "\n",
      "2022-09-23 20:32:18 (78.7 MB/s) - ‘sample1.flac’ saved [282378/282378]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget https://cdn-media.huggingface.co/speech_samples/sample1.flac"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████| 2.87G/2.87G [01:11<00:00, 42.9MiB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Detected language: english\n",
      " going along slushy country roads and speaking to damp audiences in drafty school rooms day after day for a fortnight. he'll have to put in an appearance at some place of worship on sunday morning and he can come to us immediately afterwards.\n"
     ]
    }
   ],
   "source": [
    "import whisper\n",
    "\n",
    "model = whisper.load_model(\"large\")\n",
    "result = model.transcribe(\"sample1.flac\")\n",
    "print(result[\"text\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Create Custom Handler for Inference Endpoints\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting handler.py\n"
     ]
    }
   ],
   "source": [
    "%%writefile handler.py\n",
    "from typing import Dict\n",
    "from transformers.pipelines.audio_utils import ffmpeg_read\n",
    "import whisper\n",
    "import torch\n",
    "\n",
    "SAMPLE_RATE = 16000\n",
    "\n",
    "\n",
    "\n",
    "class EndpointHandler():\n",
    "    def __init__(self, path=\"\"):\n",
    "        # load the model\n",
    "        self.model = whisper.load_model(\"medium\")\n",
    "\n",
    "\n",
    "    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:\n",
    "        \"\"\"\n",
    "        Args:\n",
    "            data (:obj:):\n",
    "                includes the deserialized audio file as bytes\n",
    "        Return:\n",
    "            A :obj:`dict`:. base64 encoded image\n",
    "        \"\"\"\n",
    "        # process input\n",
    "        inputs = data.pop(\"inputs\", data)\n",
    "        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)\n",
    "        audio_tensor= torch.from_numpy(audio_nparray)\n",
    "        \n",
    "        # run inference pipeline\n",
    "        result = self.model.transcribe(audio_nparray)\n",
    "\n",
    "        # postprocess the prediction\n",
    "        return {\"text\": result[\"text\"]}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "test custom pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from handler import EndpointHandler\n",
    "\n",
    "# init handler\n",
    "my_handler = EndpointHandler(path=\".\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/endpoints/openai-whisper-endpoint/handler.py:27: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:178.)\n",
      "  audio_tensor= torch.from_numpy(audio_nparray)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Detected language: english\n"
     ]
    }
   ],
   "source": [
    "import base64\n",
    "from PIL import Image\n",
    "from io import BytesIO\n",
    "import json\n",
    "\n",
    "# file reader\n",
    "with open(\"sample1.flac\", \"rb\") as f:\n",
    "    request = {\"inputs\": f.read()}\n",
    "\n",
    "\n",
    "# test the handler\n",
    "pred = my_handler(request)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'transcription': \" going along slushy country roads and speaking to damp audiences in draughty school rooms day after day for a fortnight. He'll have to put in an appearance at some place of worship on Sunday morning, and he can come to us immediately afterwards.\"}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'{\"transcription\": \" going along slushy country roads and speaking to damp audiences in draughty school rooms day after day for a fortnight. He\\'ll have to put in an appearance at some place of worship on Sunday morning, and he can come to us immediately afterwards.\"}'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "json.dumps({'transcription': \" going along slushy country roads and speaking to damp audiences in draughty school rooms day after day for a fortnight. He'll have to put in an appearance at some place of worship on Sunday morning, and he can come to us immediately afterwards.\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.13 ('dev': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "f6dd96c16031089903d5a31ec148b80aeb0d39c32affb1a1080393235fbfa2fc"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
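As a usage note for the notebook above: once the handler it writes is deployed as a Hugging Face Inference Endpoint, the same `sample1.flac` check can be run remotely. The sketch below is an assumption-laden example, not part of this repository: the endpoint URL and token are placeholders, and it assumes the deployed endpoint accepts raw audio bytes the way the local `my_handler(request)` test does.

```python
import requests

ENDPOINT_URL = "https://YOUR-ENDPOINT.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder token with access to the endpoint

# Send the raw FLAC bytes; the notebook's handler reads them from "inputs".
with open("sample1.flac", "rb") as f:
    audio_bytes = f.read()

response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "audio/flac"},
    data=audio_bytes,
)
print(response.json())  # e.g. {"text": " going along slushy country roads ..."}
```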
handler.py
ADDED
@@ -0,0 +1,260 @@
"""
https://huggingface.co/tomiwa1a/video-search
"""
from typing import Dict

from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
import pytube
import time


class EndpointHandler():
    # load the model
    WHISPER_MODEL_NAME = "tiny.en"
    SENTENCE_TRANSFORMER_MODEL_NAME = "multi-qa-mpnet-base-dot-v1"
    QUESTION_ANSWER_MODEL_NAME = "vblagoje/bart_lfqa"
    SUMMARIZER_MODEL_NAME = "philschmid/bart-large-cnn-samsum"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    device_number = 0 if torch.cuda.is_available() else -1

    def __init__(self, path=""):

        device = "cuda" if torch.cuda.is_available() else "cpu"
        device_number = 0 if torch.cuda.is_available() else -1
        print(f'whisper and question_answer_model will use: {device}')
        print(f'whisper and question_answer_model will use device_number: {device_number}')

        t0 = time.time()
        self.whisper_model = whisper.load_model(self.WHISPER_MODEL_NAME).to(self.device)
        t1 = time.time()

        total = t1 - t0
        print(f'Finished loading whisper_model in {total} seconds')

        t0 = time.time()
        self.sentence_transformer_model = SentenceTransformer(self.SENTENCE_TRANSFORMER_MODEL_NAME)
        t1 = time.time()

        total = t1 - t0
        print(f'Finished loading sentence_transformer_model in {total} seconds')

        t0 = time.time()
        self.summarizer = pipeline("summarization", model=self.SUMMARIZER_MODEL_NAME, device=device_number)
        t1 = time.time()

        total = t1 - t0
        print(f'Finished loading summarizer in {total} seconds')

        self.question_answer_tokenizer = AutoTokenizer.from_pretrained(self.QUESTION_ANSWER_MODEL_NAME)
        t0 = time.time()
        self.question_answer_model = AutoModelForSeq2SeqLM.from_pretrained(
            self.QUESTION_ANSWER_MODEL_NAME).to(self.device)
        t1 = time.time()
        total = t1 - t0
        print(f'Finished loading question_answer_model in {total} seconds')

    def __call__(self, data: Dict[str, str]) -> Dict:
        """
        Args:
            data (:obj:):
                includes the URL to the video to transcribe
        Return:
            A :obj:`dict` containing the transcription, embeddings, summary or answer
        """
        # process input
        print('data', data)

        if "inputs" not in data:
            raise Exception(f"data is missing 'inputs' key which EndpointHandler expects. Received: {data}"
                            f" See: https://huggingface.co/docs/inference-endpoints/guides/custom_handler#2-create-endpointhandler-cp")
        video_url = data.pop("video_url", None)
        query = data.pop("query", None)
        long_form_answer = data.pop("long_form_answer", None)
        summarize = data.pop("summarize", False)
        encoded_segments = {}
        if video_url:
            video_with_transcript = self.transcribe_video(video_url)
            video_with_transcript['transcript']['transcription_source'] = f"whisper_{self.WHISPER_MODEL_NAME}"
            encode_transcript = data.pop("encode_transcript", True)
            if encode_transcript:
                encoded_segments = self.combine_transcripts(video_with_transcript)
                encoded_segments = {
                    "encoded_segments": self.encode_sentences(encoded_segments)
                }
            return {
                **video_with_transcript,
                **encoded_segments
            }
        elif summarize:
            summary = self.summarize_video(data["segments"])
            return {"summary": summary}
        elif query:
            if long_form_answer:
                context = data.pop("context", None)
                answer = self.generate_answer(query, context)
                response = {
                    "answer": answer
                }

                return response
            else:
                query = [{"text": query, "id": ""}] if isinstance(query, str) else query
                encoded_segments = self.encode_sentences(query)

                response = {
                    "encoded_segments": encoded_segments
                }

                return response

        else:
            return {
                "error": "'video_url' or 'query' must be provided"
            }

    def transcribe_video(self, video_url):
        decode_options = {
            # Set language to None to support multilingual,
            # but it will take longer to process while it detects the language.
            # Realized this by running in verbose mode and seeing how much time
            # was spent on the decoding language step
            "language": "en",
            "verbose": True
        }
        yt = pytube.YouTube(video_url)
        video_info = {
            'id': yt.video_id,
            'thumbnail': yt.thumbnail_url,
            'title': yt.title,
            'views': yt.views,
            'length': yt.length,
            # Although this might seem redundant since we already have id,
            # it allows the link to the video to be accessed in 1-click in the API response
            'url': f"https://www.youtube.com/watch?v={yt.video_id}"
        }
        stream = yt.streams.filter(only_audio=True)[0]
        path_to_audio = f"{yt.video_id}.mp3"
        stream.download(filename=path_to_audio)
        t0 = time.time()
        transcript = self.whisper_model.transcribe(path_to_audio, **decode_options)
        t1 = time.time()
        for segment in transcript['segments']:
            # Remove the tokens array, it makes the response too verbose
            segment.pop('tokens', None)

        total = t1 - t0
        print(f'Finished transcription in {total} seconds')

        # postprocess the prediction
        return {"transcript": transcript, 'video': video_info}

    def encode_sentences(self, transcripts, batch_size=64):
        """
        Encoding all of our segments at once or storing them locally would require too much compute or memory.
        So we do it in batches of 64
        :param transcripts:
        :param batch_size:
        :return:
        """
        # loop through in batches of 64
        all_batches = []
        for i in tqdm(range(0, len(transcripts), batch_size)):
            # find end position of batch (for when we hit end of data)
            i_end = min(len(transcripts), i + batch_size)
            # extract the metadata like text, start/end positions, etc
            batch_meta = [{
                **row
            } for row in transcripts[i:i_end]]
            # extract only text to be encoded by embedding model
            batch_text = [
                row['text'] for row in batch_meta
            ]
            # create the embedding vectors
            batch_vectors = self.sentence_transformer_model.encode(batch_text).tolist()

            batch_details = [
                {
                    **batch_meta[x],
                    'vectors': batch_vectors[x]
                } for x in range(0, len(batch_meta))
            ]
            all_batches.extend(batch_details)

        return all_batches

    def summarize_video(self, segments):
        for index, segment in enumerate(segments):
            segment['summary'] = self.summarizer(segment['text'])
            segment['summary'] = segment['summary'][0]['summary_text']
            print('index', index)
            print('length', segment['length'])
            print('text', segment['text'])
            print('summary', segment['summary'])

        return segments

    def generate_answer(self, query, documents):

        # concatenate question and support documents into BART input
        conditioned_doc = "<P> " + " <P> ".join([d for d in documents])
        query_and_docs = "question: {} context: {}".format(query, conditioned_doc)

        model_input = self.question_answer_tokenizer(query_and_docs, truncation=False, padding=True,
                                                     return_tensors="pt")

        generated_answers_encoded = self.question_answer_model.generate(
            input_ids=model_input["input_ids"].to(self.device),
            attention_mask=model_input["attention_mask"].to(self.device),
            min_length=64,
            max_length=256,
            do_sample=False,
            early_stopping=True,
            num_beams=8,
            temperature=1.0,
            top_k=None,
            top_p=None,
            eos_token_id=self.question_answer_tokenizer.eos_token_id,
            no_repeat_ngram_size=3,
            num_return_sequences=1)
        answer = self.question_answer_tokenizer.batch_decode(generated_answers_encoded, skip_special_tokens=True,
                                                             clean_up_tokenization_spaces=True)
        return answer

    @staticmethod
    def combine_transcripts(video, window=6, stride=3):
        """

        :param video:
        :param window: number of sentences to combine
        :param stride: number of sentences to 'stride' over, used to create overlap
        :return:
        """
        new_transcript_segments = []

        video_info = video['video']
        transcript_segments = video['transcript']['segments']
        for i in tqdm(range(0, len(transcript_segments), stride)):
            i_end = min(len(transcript_segments), i + window)
            text = ' '.join(transcript['text']
                            for transcript in
                            transcript_segments[i:i_end])
            # TODO: Should int (float to seconds) conversion happen at the API level?
            start = int(transcript_segments[i]['start'])
            end = int(transcript_segments[i]['end'])
            new_transcript_segments.append({
                **video_info,
                **{
                    'start': start,
                    'end': end,
                    'title': video_info['title'],
                    'text': text,
                    'id': f"{video_info['id']}-t{start}",
                    'url': f"https://youtu.be/{video_info['id']}?t={start}",
                    'video_id': video_info['id'],
                }
            })
        return new_transcript_segments
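A minimal local smoke test for the handler above, assuming the requirements below plus `transformers` and `torch` are installed; the YouTube URL is a placeholder, and all four models are downloaded on the first run.

```python
from handler import EndpointHandler

handler = EndpointHandler(path=".")

# Transcribe a video and embed its transcript segments (placeholder URL).
result = handler({
    "inputs": "",
    "video_url": "https://www.youtube.com/watch?v=VIDEO_ID",
})
print(result["video"]["title"])
print(len(result["encoded_segments"]), "encoded segments")

# Ask a long-form question against context passages taken from the transcript.
answer = handler({
    "inputs": "",
    "query": "What is the main topic of this video?",
    "long_form_answer": True,
    "context": [segment["text"] for segment in result["transcript"]["segments"][:3]],
})
print(answer["answer"])
```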
requirements.txt
ADDED
@@ -0,0 +1,3 @@
git+https://github.com/openai/whisper.git@8cf36f3508c9acd341a45eb2364239a3d81458b9
pytube==12.1.2
sentence-transformers==2.2.2
sample1.flac
ADDED
Binary file (282 kB)