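"""Webhook-driven processing Space: listens for updates to the raw BORU dataset,
gets embeddings for the updated rows, pushes the processed dataset to the Hub,
and rebuilds the Nomic Atlas map."""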
import os
from pathlib import Path

import gradio as gr
from huggingface_hub import WebhookPayload, WebhooksServer

from src.my_logger import setup_logger
from src.utilities import load_datasets, merge_and_update_datasets
from src.visualize_logs import log_file_to_html_string
from src.build_nomic import build_nomic
from src.readme_update import update_dataset_readme

proj_dir = Path(__file__).parent  # resolve paths relative to this file, not the current working directory

logger = setup_logger(__name__)
logger.info("Starting Application...")

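# Configuration comes from environment variables (Space secrets); the webhook
# secret falls back to "secret" so the app can be run locally without one.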
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
PROCESSED_DATASET = os.environ["PROCESSED_DATASET"]
HUGGINGFACE_AUTH_TOKEN = os.environ["HUGGINGFACE_AUTH_TOKEN"]
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", "secret")

intro_md = """
# Processing BORU
## Creation Details
This space is triggered by a webhook for changes on 
[derek-thomas/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-bestofredditorupdates).
 It then takes the updates from that dataset and get embeddings and puts the results in 
[https://huggingface.co/datasets/derek-thomas/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/derek-thomas/reddit-bestofredditorupdates-processed)

Check out the original on [Nomic](https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map)

## What is this for beginners?
This is a space to visually search the subreddit /r/bestofredditorupdates. Have you ever been curious to search for stories
that are similar to one of your favorites? This can help!

- Each dot represents a post (try clicking on one)
- Closer dots are similar in topic
- Use the filters on the left to help you narrow down what you are looking for
    - The lasso can help you search in a smaller range that you drag with your mouse
    - The filter can help you narrow by field, 
        - Filtering posts that are `CONCLUDED`
        - Filtering popular posts
        - Filtering by date
    - The search can help you look by keyword
    
## Todo
- Ignore the colors for now, I need to clean that up :) 
- I need to integrate with Nomic's semantic search
"""

url = "https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map/cdd8c890-2fac-4ea6-91f8-e6821203cfcb"
html_str = f'<iframe src="{url}" style="border:none;height:1024px;width:100%" allow="clipboard-read; clipboard-write" title="Nomic Atlas"></iframe>'

with gr.Blocks() as ui:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.HTML(html_str)
    with gr.Tab("Logs"):
        gr.Markdown("# Logs")
        output = gr.HTML(log_file_to_html_string, every=1)

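# WebhooksServer wraps the Gradio UI in a FastAPI app and serves every endpoint
# registered with @app.add_webhook; requests are validated against WEBHOOK_SECRET.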
app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)


@app.add_webhook("/dataset_repo")
async def community(payload: WebhookPayload):
    # Only react to repo-scoped events (e.g. content pushes); ignore everything else
    if not payload.event.scope.startswith("repo"):
        return
    logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")

    logger.info(f"Loading new dataset...")
    dataset, original_dataset = load_datasets()
    logger.info(f"Loaded new dataset")

    logger.info(f"Merging and Updating row...")
    dataset, updated_row_count = merge_and_update_datasets(dataset, original_dataset)

    # Push the augmented dataset to the Hugging Face Hub
    logger.info("Pushing processed data to the Hugging Face Hub...")
    dataset.push_to_hub(PROCESSED_DATASET, token=HUGGINGFACE_AUTH_TOKEN)
    logger.info("Pushed processed data to the Hugging Face Hub")

    # Refresh the dataset card with the updated row count
    update_dataset_readme(dataset_name=PROCESSED_DATASET, subreddit=SUBREDDIT, new_rows=updated_row_count)
    logger.info("Updated README.")

    # Rebuild the Nomic Atlas map from the updated dataset
    logger.info("Building Nomic...")
    build_nomic(dataset=dataset)
    logger.info("Built Nomic")

if __name__ == '__main__':
    app.launch(server_name="0.0.0.0", show_error=True, server_port=7860)
    # ui.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860)