derek-thomas committed
Commit 779c2fa · 1 Parent(s): 7fa626d

Updating column names

Files changed: src/build_nomic.py (+10 -3)

src/build_nomic.py  CHANGED
@@ -10,11 +10,16 @@ NOMIC_KEY = os.getenv('NOMIC_KEY')
 nomic.login(NOMIC_KEY)
 
 
+def count_words(text):
+    words = text.split()
+    return len(words)
+
+
 def build_nomic(dataset):
     df = dataset['train'].to_pandas()
 
-    non_embedding_columns = ['date_utc', 'title', 'flair', 'content', 'poster', 'permalink', 'id', '
-                             'score', '
+    non_embedding_columns = ['date_utc', 'title', 'flair', 'content', 'poster', 'permalink', 'id', 'word_count',
+                             'score', 'score_percentile']
 
     # Calculate the 0th, 10th, 20th, ..., 90th percentiles for the 'score' column
     percentiles = df['score'].quantile([0, .1, .2, .3, .4, .5, .6, .7, .8, .9]).tolist()
@@ -30,9 +35,11 @@ def build_nomic(dataset):
     # This assigns each score to its corresponding percentile range
     df['score_percentile'] = pd.cut(df['score'], bins=bins, labels=labels, include_lowest=True)
 
+    df['word_count'] = df['content'].apply(count_words)
+
     # Create Atlas project
     project = atlas.map_data(embeddings=np.stack(df['embedding'].values),
                              data=df[non_embedding_columns].to_dict(orient='records'),
                              id_field='id',
                              identifier='BORU Subreddit Neural Search',
-                             )
+                             )
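
For orientation (not part of the commit), a minimal sketch of how the updated build_nomic might be driven end to end. The dataset repo id below is hypothetical, and the 'train' split is assumed to already carry an 'embedding' column of vectors alongside the metadata fields listed in non_embedding_columns:

    # Hypothetical driver script; the repo id is an assumption, not taken
    # from this commit. build_nomic expects dataset['train'] to contain an
    # 'embedding' column (vectors) plus the listed metadata columns.
    from datasets import load_dataset

    from src.build_nomic import build_nomic

    dataset = load_dataset("derek-thomas/best-of-reddit-updates")  # hypothetical repo id
    build_nomic(dataset)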
|