# https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map
import os
import re
import time
import markdown
import nomic
import numpy as np
import pandas as pd
from nomic import atlas
from nomic.dataset import AtlasClass
from nomic.data_inference import NomicTopicOptions
from src.my_logger import setup_logger

# NOMIC_KEY is expected in the environment (e.g. as a Hugging Face Space secret)
NOMIC_KEY = os.getenv('NOMIC_KEY')
nomic.login(NOMIC_KEY)

logger = setup_logger(__name__)


def count_words(text):
    words = text.split()
    return len(words)


def convert_markdown_to_html(markdown_text):
    html = markdown.markdown(markdown_text)
    return html
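
# A quick sanity check for the helper above (illustrative, not from the
# original script): markdown.markdown('**hi**') -> '<p><strong>hi</strong></p>',
# so html_content holds a rendered HTML fragment rather than raw markdown.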


def delete_old_nomic():
    logger.info("Trying to delete old version of nomic Atlas...")
    try:
        ac = AtlasClass()
        atlas_id = ac._get_dataset_by_slug_identifier("derek2/boru-subreddit-neural-search")['id']
        ac._delete_project_by_id(atlas_id)
        logger.info("Succeeded in deleting old version of nomic Atlas.")
        logger.info("Sleeping for 60s to wait for old version deletion on the server side")
        time.sleep(60)
    except Exception:
        logger.info("Failed to delete old version of nomic Atlas.")


def build_nomic(dataset):
    df = dataset['train'].to_pandas()
    non_embedding_columns = ['date_utc', 'title', 'flair', 'poster', 'permalink', 'id', 'word_count',
                             'score', 'score_percentile', 'html_content', 'subreddit']

    # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
    percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
    # Ensure the bins are unique and include the maximum score
    bins = sorted(set(percentiles + [df['score'].max()]))
    # Define the labels for the percentile ranges
    # The number of labels should be one less than the number of bins
    labels = [int(i * 10) for i in range(len(bins) - 1)]
    # Add a 'score_percentile' column to the DataFrame
    # This assigns each score to its corresponding percentile range
    df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
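
    # A minimal sketch of the bucketing above (values are assumed for
    # illustration, not taken from the dataset): with bins [1, 5, 10, 50] and
    # labels [0, 10, 20], a score of 7 falls in (5, 10] and gets label 10,
    # i.e. the 10th-20th percentile bucket:
    #   pd.cut(pd.Series([7]), bins=[1, 5, 10, 50], labels=[0, 10, 20],
    #          include_lowest=True)  # -> [10]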
    df['word_count'] = df['content'].apply(count_words)
    df['html_content'] = df['content'].apply(convert_markdown_to_html)

    # Regex to extract subreddit
    subreddit_re = re.compile(r'r/(\w+)')

    def extract_subreddit(text):
        match = subreddit_re.search(text)
        if match:
            return match.group(1)
        return ''

    # Apply the function
    df['subreddit'] = df['content'].apply(extract_subreddit)
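    # Illustrative behavior (example text is assumed, not from the dataset):
    #   extract_subreddit('Originally posted in r/AskReddit last week')  # -> 'AskReddit'
    #   extract_subreddit('No subreddit mentioned here')                 # -> ''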

    topic_options = NomicTopicOptions(build_topic_model=True, community_description_target_field='subreddit')

    delete_old_nomic()

    # Create Atlas project
    logger.info("Trying to create new version of Atlas...")
    project = atlas.map_data(embeddings=np.stack(df['embedding'].values),
                             data=df[non_embedding_columns].to_dict(orient='records'),
                             id_field='id',
                             identifier='BORU Subreddit Neural Search',
                             topic_model=topic_options)
    logger.info(f"Succeeded in creating new version of nomic Atlas: {project.slug}")