patrickvonplaten committed on
Commit 9047602 · 1 Parent(s): 56f4087
Files changed (1)
  1. upload.py +54 -0
upload.py ADDED
@@ -0,0 +1,54 @@
+ #!/usr/bin/env python3
+ import json
+
+ import huggingface_hub
+ from bs4 import BeautifulSoup
+ from huggingface_hub import HfApi, get_repo_discussions, hf_hub_download
+
+ # repo_id = "stabilityai/stable-diffusion"
+ repo_id = "huggingface-projects/diffuse-the-rest"
+ dataset_repo_id = "triple-t/dummy"
+ cache_dir = "/home/patrick/image_cache"
+
+ print("retrieve images 0...")
+ discussions_list = list(get_repo_discussions(repo_id=repo_id, repo_type="space"))
+ print("retrieve images 1...")
+
+ # Collect prompt/image pairs from the first five discussions on the Space.
+ all_data = []
+ for i, disc in enumerate(discussions_list[:5]):
+     disc = huggingface_hub.get_discussion_details(repo_id=repo_id, repo_type="space", discussion_num=disc.num)
+     # Parse the discussion's opening comment and pull any <img> URLs out of it.
+     page = BeautifulSoup(disc.events[0]._event["data"]["latest"]["raw"], "html.parser")
+     image_urls = [link.get("src") for link in page.find_all("img")]
+     if not image_urls:
+         continue
+     all_data.append({
+         "discussion_number": i,
+         "data": {
+             "prompt": disc.title,
+             "images": image_urls,
+         },
+     })
+
+ file_name = "_".join(repo_id.split("/")) + ".json"
+ api = HfApi()
+
+ # Download the existing JSON file from the dataset repo and extend it with the new entries.
+ path = hf_hub_download(repo_id=dataset_repo_id, filename=file_name, cache_dir=cache_dir, repo_type="dataset")
+
+ with open(path, "r") as f:
+     data = json.load(f)
+
+ data += all_data
+
+ with open(path, "w") as f:
+     f.write(json.dumps(data, sort_keys=True, indent=4))
+
+ # Push the updated file back to the dataset repo.
+ api.upload_file(
+     path_or_fileobj=path,
+     path_in_repo=file_name,
+     repo_id=dataset_repo_id,
+     repo_type="dataset",
+ )