File size: 3,198 Bytes
ba7deb1
 
fdc091a
2f591da
ba7deb1
fdc091a
ba7deb1
 
fdc091a
 
 
 
ba7deb1
d9a1859
 
ba7deb1
 
d9a1859
ba7deb1
 
779c2fa
 
 
 
 
fdc091a
 
 
 
 
a5a27a1
 
 
 
 
 
 
 
 
 
 
 
fdc091a
ba7deb1
 
 
fdc091a
 
ba7deb1
 
 
 
 
 
 
 
 
 
 
 
 
9ae78a1
ba7deb1
779c2fa
fdc091a
 
 
 
 
 
 
 
 
 
 
 
 
 
779c2fa
a5a27a1
d9a1859
ba7deb1
d9a1859
ba7deb1
 
 
 
fdc091a
779c2fa
8ba4837
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
import os
import re
import time

import markdown
import nomic
import numpy as np
import pandas as pd
from nomic import atlas, Nomic
from nomic.dataset import AtlasClass
from nomic.data_inference import NomicTopicOptions

from src.my_logger import setup_logger

# Nomic API key read from the NOMIC_KEY environment variable (None when unset).
NOMIC_KEY = os.getenv('NOMIC_KEY')
# NOTE: this runs at import time, so importing this module calls nomic.login.
nomic.login(NOMIC_KEY)
# Module-level logger obtained from the project's setup_logger helper.
logger = setup_logger(__name__)


def count_words(text):
    """Return the number of whitespace-separated tokens in *text*."""
    # str.split() with no arguments splits on runs of whitespace and drops
    # leading/trailing blanks, so an empty or blank string counts as 0 words.
    return len(text.split())


def convert_markdown_to_html(markdown_text):
    """Render *markdown_text* with ``markdown.markdown`` and return the result."""
    return markdown.markdown(markdown_text)


def delete_old_nomic():
    """Best-effort deletion of the previously published Atlas dataset.

    Looks up the dataset with slug ``derek2/boru-subreddit-neural-search`` and
    deletes it. On success, sleeps for 60 seconds before returning. Any
    ordinary error during lookup or deletion is logged and swallowed so the
    caller can continue; ``KeyboardInterrupt`` and ``SystemExit`` are NOT
    swallowed.
    """
    logger.info("Trying to delete old version of nomic Atlas...")
    try:
        ac = AtlasClass()
        atlas_id = ac._get_dataset_by_slug_identifier("derek2/boru-subreddit-neural-search")['id']
        ac._delete_project_by_id(atlas_id)
    except Exception as exc:
        # Deliberately best-effort: a failed delete must not abort the caller,
        # but the reason is logged so the failure can be diagnosed.
        logger.info("Failed to delete old version of nomic Atlas.")
        logger.info(f"Reason for failed deletion: {exc!r}")
        return

    # Success path lives outside the try block so that an interrupt during the
    # sleep is not misreported as a failed deletion.
    logger.info("Succeeded in deleting old version of nomic Atlas.")
    logger.info("Sleeping for 60s to wait for old version deletion on the server-side")
    time.sleep(60)


def build_nomic(dataset):
    """Rebuild the Atlas map from the ``train`` split of *dataset*.

    Converts ``dataset['train']`` to a DataFrame, derives the
    ``score_percentile``, ``word_count``, ``html_content`` and ``subreddit``
    columns, calls ``delete_old_nomic()`` and then uploads the rows together
    with their embeddings through ``atlas.map_data``.

    Args:
        dataset: Object indexable by ``'train'`` whose value exposes
            ``to_pandas()``. The resulting frame must provide the columns
            ``score``, ``content``, ``embedding``, ``date_utc``, ``title``,
            ``flair``, ``poster``, ``permalink`` and ``id``.
    """
    df = dataset['train'].to_pandas()

    non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'permalink', 'id', 'word_count',
                             'score', 'score_percentile', 'html_content', 'subreddit']

    # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
    decile_levels = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9]
    percentiles = df['score'].quantile(decile_levels).tolist()

    # Ensure the bins are unique and include the maximum score
    bins = sorted(set(percentiles + [df['score'].max()]))

    # Label every bin with the decile of its left edge. When several deciles
    # share the same score the edges collapse into one; the highest of those
    # deciles is kept (later entries overwrite earlier ones), so the remaining
    # bins keep their true decile instead of being renumbered 0, 10, 20, ...
    # With all-distinct edges this yields exactly [0, 10, ..., 90].
    decile_of_edge = {edge: decile * 10 for decile, edge in enumerate(percentiles)}
    labels = [decile_of_edge[left_edge] for left_edge in bins[:-1]]

    # Add a 'score_percentile' column to the DataFrame
    # This assigns each score to its corresponding percentile range
    df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)

    df['word_count'] = df['content'].apply(count_words)
    df['html_content'] = df['content'].apply(convert_markdown_to_html)

    # Regex to extract the first "r/<name>" mention from the post content
    subreddit_re = re.compile(r'r/(\w+)')

    def extract_subreddit(text):
        """Return the first subreddit name mentioned in *text*, or '' if none."""
        match = subreddit_re.search(text)
        if match:
            return match.group(1)
        return ''

    # Apply the function
    df['subreddit'] = df['content'].apply(extract_subreddit)

    topic_options = NomicTopicOptions(build_topic_model=True, community_description_target_field='subreddit')

    # Remove the previously published map before uploading the new one.
    delete_old_nomic()

    # Create Atlas project
    logger.info("Trying to create new version of Atlas...")
    project = atlas.map_data(embeddings=np.stack(df['embedding'].values),
                             data=df[non_embedding_columns].to_dict(orient='records'),
                             id_field='id',
                             identifier='BORU Subreddit Neural Search',
                             topic_model=topic_options
                             )
    logger.info(f"Succeeded in creating new version of nomic Atlas: {project.slug}")