In [1]:
import numpy as np
import pandas as pd
import pyarrow

In [2]:
# Load the song table. Later cells read its 'artist', 'title', 'lyrics'
# and 'embedding_glove' / '_minilm' / '_roberta' / '_gpt' columns.
df = pd.read_parquet(path='v2ga_w_embeddings_half.parquet')

In [3]:
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    v1, v2 : array-like
        Anything accepted by ``np.dot`` and ``np.linalg.norm``.

    Returns
    -------
    ``np.dot(v1, v2) / (norm(v1) * norm(v2))``, or ``np.nan`` when either
    norm is exactly zero (the ratio is undefined for a zero vector).
    """
    inner = np.dot(v1, v2)
    norm_a, norm_b = np.linalg.norm(v1), np.linalg.norm(v2)
    if norm_a != 0.0 and norm_b != 0.0:
        return inner / (norm_a * norm_b)
    # At least one vector has zero length: no direction to compare.
    return np.nan

In [25]:
def relevance_scores(query_embed, df, embeddings):
    """Rank every row of ``df`` by cosine similarity to ``query_embed``.

    Parameters
    ----------
    query_embed : array-like
        Embedding to compare against.
    df : pandas.DataFrame
        Frame holding one embedding per row in column ``embeddings``.
    embeddings : str
        Name of the embedding column in ``df``.

    Returns
    -------
    pandas.Series
        Similarities sorted in descending order. The Series is built from a
        plain list, so its index labels are the row positions in ``df``.
        The top-ranked entry is overwritten with 0.
    """
    raw = []
    for candidate in df[embeddings]:
        raw.append(cosine_similarity(query_embed, candidate))

    ranked = pd.Series(raw).sort_values(ascending=False)

    # Overwrite the best score with 0 -- presumably to suppress the query's
    # own self-match when the query embedding comes from `df`; confirm with
    # callers before reusing this for queries that are not in `df`.
    ranked.iloc[0] = 0
    return ranked

In [46]:
def _song_summary(row):
    """Return title/artist/lyrics of ``row`` as a dict, with lyric newlines replaced by '. '."""
    summary = row[['title', 'artist', 'lyrics']].to_dict()
    summary['lyrics'] = summary['lyrics'].replace('\n', '. ')
    return summary


def semantic_search(artist, title):
    """Find, for each embedding model, the song most similar to the chosen one.

    Reads the module-level ``df`` and delegates scoring to
    ``relevance_scores``; the match for each model is the row at
    ``scores.idxmax()``.

    Parameters
    ----------
    artist : str
        Value compared for equality against ``df['artist']``.
    title : str
        Value compared for equality against ``df['title']``.

    Returns
    -------
    dict
        Keys ``'chosen_song'``, ``'glove'``, ``'minilm'``, ``'roberta'`` and
        ``'gpt'``. Each value is a dict with ``'title'``, ``'artist'`` and
        ``'lyrics'`` (newlines in the lyrics replaced by ``'. '``).

    Raises
    ------
    IndexError
        If no row of ``df`` matches both ``artist`` and ``title``.
    """
    matches = df[(df['artist'] == artist) & (df['title'] == title)]
    if matches.empty:
        # Same exception type the bare `.values[0]` lookup used to raise,
        # but with a message that says what was actually missing.
        raise IndexError(
            "No song found for artist={!r}, title={!r}".format(artist, title)
        )
    # If several rows match, the first one is used.
    chosen_row = matches.iloc[0]

    results = {'chosen_song': _song_summary(chosen_row)}
    for model in ('glove', 'minilm', 'roberta', 'gpt'):
        column = 'embedding_' + model
        scores = relevance_scores(chosen_row[column], df, column)
        # `scores` is labelled by row position in `df`, hence `.iloc`.
        results[model] = _song_summary(df.iloc[scores.idxmax()])

    return results

In [47]:
# Example query: nearest song to "Century City" under each embedding model.
semantic_search(artist="Tom Petty", title="Century City")

{'chosen_song': {'title': 'Century City',
  'artist': 'Tom Petty',
  'lyrics': "Tom Petty\nMiscellaneous\nCentury City\nSometimes I wanna leave here\nSometimes I wanna go right back where I came from\nBack where I belong\nBut it never lasts for too long, it always goes away\nAnd I still don't look for reasons\nThat's much too hard these days\nWhy worry 'bout the rain?\nWhy worry 'bout the thunder?\nCentury City's got everything covered\nWell, your mama gave you lovin'\nMama held you near\nNow mama can't do nothin'\nBaby, mama just ain't here\nAnd you can pretend all you want to do\nBut that won't work no more\nAnd you can't run back to daddy\nYou tried that once before\nBut why worry 'bout your daddy?\nWhy worry 'bout your mother?\nCentury City's got everything covered\nWe're gonna live in Century City\nGo ahead and give in (Century City) like modern men\nAnd modern girls, we're gonna live in the modern world\nSometimes I get discouraged\nSometimes I feel so down\nSometimes I get so wo

In [49]:
# Scratch check of the lookup + lyric flattening used by semantic_search.
artist = "Tom Petty"
title = "Century City"

# Keep the filtered frame under its own name so `chosen_song` is only ever
# bound to the single-row Series.
song_matches = df[(df['artist'] == artist) & (df['title'] == title)]

# `.copy()` makes the row independent of `song_matches`, so the assignment
# below cannot trigger SettingWithCopyWarning or write through to a parent.
chosen_song = song_matches[['title', 'artist', 'lyrics']].iloc[0].copy()
chosen_song['lyrics'] = chosen_song['lyrics'].replace('\n', '. ')

chosen_song

title                                          Century City
artist                                            Tom Petty
lyrics    Tom Petty. Miscellaneous. Century City. Someti...
Name: 3833, dtype: object

In [35]:
# NOTE(review): the saved output of this cell is stale. It was executed at
# count 35, before the cell at count 49 rebound `chosen_song` to a Series
# with flattened lyrics; the stored output still shows the one-row
# DataFrame with raw '\n'. A Restart & Run All will display the Series.
chosen_song

Unnamed: 0,title,artist,lyrics
3833,Century City,Tom Petty,Tom Petty\nMiscellaneous\nCentury City\nSometi...
