openfree committed on
Commit
7f8500d
·
verified ·
1 Parent(s): dad05a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -51
app.py CHANGED
@@ -993,27 +993,75 @@ def refresh_data():
993
 
994
 
995
  target_datasets = {
996
- "aiqtech/kolaw": "https://huggingface.co/datasets/aiqtech/kolaw",
997
- "heegyu/korquad-v1-v2": "https://huggingface.co/datasets/heegyu/korquad-v1-v2",
998
- "nlpai-lab/kowiki": "https://huggingface.co/datasets/nlpai-lab/kowiki",
999
- "KETI-AIR/korpora": "https://huggingface.co/datasets/KETI-AIR/korpora",
1000
- "heegyu/korean-parallel-corpora": "https://huggingface.co/datasets/heegyu/korean-parallel-corpora",
1001
- "heegyu/korean-hate-speech": "https://huggingface.co/datasets/heegyu/korean-hate-speech",
1002
- "KETI-AIR/korean-parallel-corpora": "https://huggingface.co/datasets/KETI-AIR/korean-parallel-corpora",
1003
- "heegyu/korean-chatbot-data": "https://huggingface.co/datasets/heegyu/korean-chatbot-data",
1004
- "heegyu/korean-qa": "https://huggingface.co/datasets/heegyu/korean-qa",
1005
- "heegyu/korean-summarization": "https://huggingface.co/datasets/heegyu/korean-summarization",
1006
- "nlpai-lab/kullm-chat-v2": "https://huggingface.co/datasets/nlpai-lab/kullm-chat-v2",
1007
- "upstage/open-ko-llm-leaderboard": "https://huggingface.co/datasets/upstage/open-ko-llm-leaderboard"
1008
  }
1009
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1010
  def get_datasets_data(progress=gr.Progress()):
1011
  def calculate_rank(dataset_id, all_global_datasets, korea_datasets):
1012
- # 글로벌 순위 확인
1013
  global_rank = next((idx for idx, d in enumerate(all_global_datasets, 1)
1014
  if d.get('id', '').strip() == dataset_id.strip()), None)
1015
 
1016
- # Korea ๋ฐ์ดํ„ฐ์…‹์ธ ๊ฒฝ์šฐ
1017
  is_korea = any(d.get('id', '').strip() == dataset_id.strip() for d in korea_datasets)
1018
 
1019
  if is_korea:
@@ -1039,37 +1087,9 @@ def get_datasets_data(progress=gr.Progress()):
1039
  empty_df = pd.DataFrame(columns=['Global Rank', 'Dataset ID', 'Title', 'Downloads', 'Likes', 'Korea Search', 'URL'])
1040
  return fig, error_html, empty_df
1041
 
1042
- # 일반 데이터셋과 Korea 관련 데이터셋 가져오기
1043
- params = {
1044
- 'limit': 3000,
1045
- 'full': 'true'
1046
- }
1047
-
1048
- all_datasets_response = requests.get(
1049
- "https://huggingface.co/api/datasets",
1050
- headers={'Authorization': f'Bearer {HF_TOKEN}'},
1051
- params=params
1052
- )
1053
-
1054
- korea_params = {
1055
- 'search': 'korea',
1056
- 'limit': 3000,
1057
- 'full': 'true'
1058
- }
1059
 
1060
- korea_datasets_response = requests.get(
1061
- "https://huggingface.co/api/datasets",
1062
- headers={'Authorization': f'Bearer {HF_TOKEN}'},
1063
- params=korea_params
1064
- )
1065
-
1066
- all_global_datasets = all_datasets_response.json()
1067
- korea_datasets = korea_datasets_response.json()
1068
-
1069
- # 시각화를 위한 Figure 생성
1070
- fig = go.Figure()
1071
-
1072
- # 순위 정보 수집
1073
  filtered_datasets = []
1074
  for dataset_id in target_datasets.keys():
1075
  try:
@@ -1092,8 +1112,6 @@ def get_datasets_data(progress=gr.Progress()):
1092
  'title': dataset_data.get('title', 'No Title'),
1093
  'is_korea': is_korea
1094
  })
1095
-
1096
- print(f"Dataset {dataset_id}: Rank={rank}, Is Korea={is_korea}")
1097
  else:
1098
  filtered_datasets.append({
1099
  'id': dataset_id,
@@ -1107,12 +1125,13 @@ def get_datasets_data(progress=gr.Progress()):
1107
  print(f"Error processing {dataset_id}: {str(e)}")
1108
  continue
1109
 
1110
- # 순위로 정렬
1111
  filtered_datasets.sort(key=lambda x: float('inf') if isinstance(x['global_rank'], str) else x['global_rank'])
1112
 
1113
  # 시각화 데이터 준비
1114
  valid_datasets = [d for d in filtered_datasets if isinstance(d['global_rank'], (int, float))]
1115
 
 
 
1116
  if valid_datasets:
1117
  ids = [d['id'] for d in valid_datasets]
1118
  ranks = [d['global_rank'] for d in valid_datasets]
@@ -1120,10 +1139,12 @@ def get_datasets_data(progress=gr.Progress()):
1120
  fig.add_trace(go.Bar(
1121
  x=ids,
1122
  y=[3001 - r for r in ranks],
1123
- text=[f"Rank: #{r}<br>Downloads: {format(d['downloads'], ',')}<br>Likes: {format(d['likes'], ',')}"
 
 
1124
  for r, d in zip(ranks, valid_datasets)],
1125
  textposition='auto',
1126
- marker_color=['rgba(255,0,0,0.6)' if d['is_korea'] else 'rgba(0,0,255,0.6)'
1127
  for d in valid_datasets],
1128
  opacity=0.8
1129
  ))
@@ -1198,8 +1219,6 @@ def get_datasets_data(progress=gr.Progress()):
1198
  } for d in filtered_datasets])
1199
 
1200
  progress(1.0, desc="Complete!")
1201
-
1202
-
1203
  return fig, html_content, df
1204
 
1205
  except Exception as e:
 
993
 
994
 
995
  target_datasets = {
996
+ "aiqtech/kolaw": "https://huggingface.co/datasets/aiqtech/kolaw"
997
+ # 필요한 데이터셋 추가
 
 
 
 
 
 
 
 
 
 
998
  }
999
 
1000
def get_korea_datasets():
    """Search the Hugging Face Hub for datasets matching the term 'korea'.

    Returns:
        list[dict]: raw dataset-metadata dicts from the HF `/api/datasets`
        endpoint, or an empty list on any HTTP or network failure
        (best-effort: errors are printed, never raised to the caller).
    """
    params = {
        "search": "korea",
        # Lowercase 'true' — consistent with the 'full' parameter used by
        # the other API calls in this file.
        "full": "true",
        "limit": 1000,
    }

    try:
        response = requests.get(
            "https://huggingface.co/api/datasets",
            headers={'Authorization': f'Bearer {HF_TOKEN}'},
            params=params,
            # Without a timeout a stalled connection would hang forever.
            timeout=30,
        )

        if response.status_code == 200:
            return response.json()
        print(f"Failed to fetch Korea datasets: {response.status_code}")
        return []
    except Exception as e:
        # Best-effort fetch: log and degrade to an empty result.
        print(f"Error fetching Korea datasets: {str(e)}")
        return []
1023
+
1024
def get_all_datasets(limit=3000):
    """Fetch up to `limit` datasets from the Hugging Face Hub, then merge
    in the 'korea' search results, deduplicated by dataset id.

    Args:
        limit: maximum number of datasets to return (default 3000).

    Returns:
        list[dict]: dataset-metadata dicts, at most `limit` entries.
    """
    all_datasets = []
    page_size = 1000

    # Page through the global listing in chunks of `page_size`.
    for offset in range(0, limit, page_size):
        params = {
            'limit': min(page_size, limit - offset),
            # Lowercase 'true' — consistent with the API convention used
            # elsewhere in this file.
            'full': 'true',
            'offset': offset,
        }

        response = requests.get(
            "https://huggingface.co/api/datasets",
            headers={'Authorization': f'Bearer {HF_TOKEN}'},
            params=params,
            # Avoid hanging the UI on a stalled request.
            timeout=30,
        )

        if response.status_code != 200:
            print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
            break

        # Parse the body once instead of calling .json() twice.
        page = response.json()
        all_datasets.extend(page)
        print(f"Fetched datasets {offset+1} to {offset+len(page)}")

        # An empty page means the listing is exhausted — stop paginating.
        if not page:
            break

    # Merge 'korea' search results the global listing may have missed.
    korea_datasets = get_korea_datasets()
    existing_ids = {dataset.get('id', '') for dataset in all_datasets}

    for korea_dataset in korea_datasets:
        korea_id = korea_dataset.get('id', '')
        if korea_id not in existing_ids:
            all_datasets.append(korea_dataset)
            existing_ids.add(korea_id)

    return all_datasets[:limit]
1059
+
1060
  def get_datasets_data(progress=gr.Progress()):
1061
  def calculate_rank(dataset_id, all_global_datasets, korea_datasets):
 
1062
  global_rank = next((idx for idx, d in enumerate(all_global_datasets, 1)
1063
  if d.get('id', '').strip() == dataset_id.strip()), None)
1064
 
 
1065
  is_korea = any(d.get('id', '').strip() == dataset_id.strip() for d in korea_datasets)
1066
 
1067
  if is_korea:
 
1087
  empty_df = pd.DataFrame(columns=['Global Rank', 'Dataset ID', 'Title', 'Downloads', 'Likes', 'Korea Search', 'URL'])
1088
  return fig, error_html, empty_df
1089
 
1090
+ all_global_datasets = get_all_datasets(limit=3000)
1091
+ korea_datasets = get_korea_datasets()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1092
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1093
  filtered_datasets = []
1094
  for dataset_id in target_datasets.keys():
1095
  try:
 
1112
  'title': dataset_data.get('title', 'No Title'),
1113
  'is_korea': is_korea
1114
  })
 
 
1115
  else:
1116
  filtered_datasets.append({
1117
  'id': dataset_id,
 
1125
  print(f"Error processing {dataset_id}: {str(e)}")
1126
  continue
1127
 
 
1128
  filtered_datasets.sort(key=lambda x: float('inf') if isinstance(x['global_rank'], str) else x['global_rank'])
1129
 
1130
  # 시각화 데이터 준비
1131
  valid_datasets = [d for d in filtered_datasets if isinstance(d['global_rank'], (int, float))]
1132
 
1133
+ fig = go.Figure()
1134
+
1135
  if valid_datasets:
1136
  ids = [d['id'] for d in valid_datasets]
1137
  ranks = [d['global_rank'] for d in valid_datasets]
 
1139
  fig.add_trace(go.Bar(
1140
  x=ids,
1141
  y=[3001 - r for r in ranks],
1142
+ text=[f"Rank: #{r}<br>{'🇰🇷 Korea Dataset<br>' if d['is_korea'] else ''}"
1143
+ f"Downloads: {format(d['downloads'], ',')}<br>"
1144
+ f"Likes: {format(d['likes'], ',')}"
1145
  for r, d in zip(ranks, valid_datasets)],
1146
  textposition='auto',
1147
+ marker_color=['rgba(255,0,0,0.6)' if d['is_korea'] else 'rgba(0,0,255,0.6)'
1148
  for d in valid_datasets],
1149
  opacity=0.8
1150
  ))
 
1219
  } for d in filtered_datasets])
1220
 
1221
  progress(1.0, desc="Complete!")
 
 
1222
  return fig, html_content, df
1223
 
1224
  except Exception as e: