|
<!DOCTYPE html> |
|
<html lang="en"> |
|
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Hub Semantic Search</title>
  <!-- These scripts are loaded synchronously on purpose: the inline script at
       the end of the body calls `lucide` and `_` as soon as it runs. -->
  <script src="https://cdn.tailwindcss.com"></script>
  <script src="https://unpkg.com/lucide@latest"></script>
  <!-- lodash supplies _.debounce for the search-as-you-type handler; the
       version is pinned so the URL resolves to a fixed build. -->
  <script src="https://cdn.jsdelivr.net/npm/lodash@4.17.21/lodash.min.js"></script>
</head>
|
<body> |
|
<div class="w-full max-w-4xl mx-auto p-4 space-y-8"> |
|
<h1 class="text-3xl font-bold text-gray-800">Hub Semantic Search</h1> |
|
|
|
<!-- Intro panel: what the tool does, its current scope, and a collapsible explainer. -->
<div
  class="bg-gradient-to-br from-blue-50 to-indigo-50 p-8 rounded-xl shadow-sm border border-blue-100 mb-8"
>
  <h2 class="text-xl font-semibold mb-4 text-gray-800 flex items-center gap-2">
    <i data-lucide="search" class="text-blue-500"></i>
    Welcome to Hub Semantic Search
  </h2>
  <p class="text-gray-700 mb-4">
    Find and explore the 🤗 Hub via semantic search on LLM-generated
    summaries!
  </p>

  <div class="bg-blue-100 text-blue-800 px-4 py-2 rounded-md mb-4">
    <p class="flex items-center gap-2">
      <i data-lucide="info"></i> Currently supporting dataset search only.
      Model search coming soon!
    </p>
  </div>

  <!-- Disclosure button: toggleAccordion() shows or hides #accordionContent
       and rotates the chevron. type="button" keeps it a plain action button. -->
  <button
    type="button"
    onclick="toggleAccordion()"
    id="accordionButton"
    class="text-blue-500 hover:text-blue-700 flex items-center gap-2 mb-4"
    aria-controls="accordionContent"
  >
    <i
      data-lucide="chevron-right"
      id="accordionIcon"
      class="transition-transform"
    ></i>
    <span>How it works</span>
  </button>

  <div id="accordionContent" class="hidden">
    <ul class="list-disc list-inside space-y-2 text-gray-600 ml-4">
      <li>
        <strong>AI-Generated Summaries:</strong> Each dataset is indexed
        using a concise, one-sentence summary generated by a large
        language model trained on thousands of Hugging Face dataset cards
      </li>
      <li>
        <strong>Semantic Search:</strong> Enter keywords or descriptions
        to find semantically similar resources based on these AI-generated
        summaries
      </li>
      <li>
        <strong>Find Similar:</strong> Enter a dataset ID (e.g.,
        "airtrain-ai/fineweb-edu-fortified") to discover related resources
        using semantic matching
      </li>
    </ul>
  </div>
</div>
|
|
|
<!-- Tabbed search UI. Both tabs render into the shared #resultsContainer and
     report failures through #errorMessage. -->
<div class="tabs w-full">
  <div class="tab-list flex gap-2 border-b mb-6">
    <button
      type="button"
      onclick="switchTab('search')"
      id="searchTab"
      class="tab-trigger active px-6 py-3 flex items-center gap-2 border-b-2 border-transparent hover:bg-gray-50 transition-colors"
    >
      <i data-lucide="search"></i> Search
    </button>
    <button
      type="button"
      onclick="switchTab('similar')"
      id="similarTab"
      class="tab-trigger px-6 py-3 flex items-center gap-2 border-b-2 border-transparent hover:bg-gray-50 transition-colors"
    >
      <i data-lucide="arrow-right"></i> Find Similar
    </button>
  </div>

  <div id="searchContent" class="tab-content space-y-4">
    <div class="card bg-white p-8 rounded-xl shadow-sm border border-gray-100">
      <p class="text-gray-600 mb-4">
        Enter keywords to search through dataset descriptions. The search
        will automatically update as you type.
      </p>
      <div class="relative">
        <!-- A placeholder is not an accessible name, so the field is labelled explicitly. -->
        <input
          type="text"
          id="searchInput"
          placeholder="Type to search (minimum 3 characters)..."
          aria-label="Search datasets by keyword"
          class="w-full p-3 border rounded-lg pr-10 focus:ring-2 focus:ring-blue-100 focus:border-blue-300 transition-all outline-none"
        />
        <div id="searchLoader" class="hidden absolute right-3 top-2">
          <i data-lucide="loader-2" class="animate-spin"></i>
        </div>
      </div>
    </div>
  </div>

  <div id="similarContent" class="hidden tab-content space-y-4">
    <div class="card bg-white p-8 rounded-xl shadow-sm border border-gray-100">
      <p class="text-gray-600 mb-4">
        Enter a dataset ID to find similar datasets. You can also click
        "Find Similar" on any search result to quickly find related
        datasets.
      </p>
      <div class="flex gap-3">
        <input
          type="text"
          id="datasetInput"
          placeholder="Enter dataset ID..."
          aria-label="Dataset ID"
          class="w-full p-3 border rounded-lg focus:ring-2 focus:ring-blue-100 focus:border-blue-300 transition-all outline-none"
        />
        <button
          type="button"
          onclick="findSimilarDatasets()"
          class="px-6 py-3 bg-blue-500 hover:bg-blue-600 text-white rounded-lg transition-colors flex items-center gap-2"
        >
          <i data-lucide="search"></i>
          Search
        </button>
        <div id="similarLoader" class="hidden">
          <i data-lucide="loader-2" class="animate-spin"></i>
        </div>
      </div>
    </div>
  </div>

  <!-- role="alert" lets assistive technology announce the error text when it appears. -->
  <div
    id="errorMessage"
    class="hidden mt-4 p-4 text-red-600 bg-red-50 rounded-md"
    role="alert"
  ></div>

  <div id="resultsContainer" class="mt-6 space-y-4"></div>
</div>
|
</div> |
|
|
|
<style>
  /* Highlight for the selected tab button; switchTab() moves the `active` class. */
  .tab-trigger.active {
    color: #3b82f6;
    border-bottom-color: #3b82f6;
  }
</style>
|
|
|
<script> |
|
|
|
// Base URL of the search backend; endpoint paths are appended by the fetch calls.
const API_URL = "https://davanstrien-huggingface-datasets-search-v2.hf.space";
// Queries shorter than this clear the results instead of calling the API.
const MIN_SEARCH_LENGTH = 3;
// Delay in milliseconds handed to _.debounce for the search-as-you-type handler.
const DEBOUNCE_MS = 300;
// Page size: each request asks for RESULTS_PER_PAGE * page results.
const RESULTS_PER_PAGE = 5;
// Upper bound on how many results the UI will page through.
const MAX_RESULTS = 100;
// Page counter shared by both tabs; switchTab() resets it and loadMore() increments it.
let currentPage = 1;

// Render the Lucide icons declared in the static markup via data-lucide attributes.
lucide.createIcons();
|
|
|
|
|
/**
 * Activate one tab and hide the others.
 *
 * @param {string} tabId - Tab key; the panel is looked up as `${tabId}Content`
 *   and the trigger button as `${tabId}Tab`.
 */
function switchTab(tabId) {
  // Pagination restarts whenever the user changes tabs.
  currentPage = 1;

  // Reset every panel and trigger first, then enable only the requested pair.
  for (const panel of document.querySelectorAll(".tab-content")) {
    panel.classList.add("hidden");
  }
  for (const trigger of document.querySelectorAll(".tab-trigger")) {
    trigger.classList.remove("active");
  }

  const targetPanel = document.getElementById(`${tabId}Content`);
  targetPanel.classList.remove("hidden");

  const targetTrigger = document.getElementById(`${tabId}Tab`);
  targetTrigger.classList.add("active");
}
|
|
|
|
|
/**
 * Escape a value for interpolation into HTML text or a double-quoted attribute.
 *
 * @param {*} value - Any value; it is converted with String() first.
 * @returns {string} The text with & < > " ' replaced by character references.
 */
function escapeHtml(value) {
  return String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Build the HTML for one result card and start the viewer-availability check.
 *
 * Every API-supplied field is escaped for the context it lands in (HTML text,
 * attribute value, inline-handler argument, URL path), so a crafted dataset id
 * or summary cannot inject markup or script.
 *
 * @param {Object} result - One search hit. Fields read: `dataset_id`, `likes`,
 *   `downloads`, `similarity` (multiplied by 100 for display) and `summary`.
 * @returns {string} HTML for the card. The preview section starts hidden and
 *   is revealed later by checkDatasetValidity() if the viewer is available.
 */
function createResultCard(result) {
  const rawId = String(result.dataset_id);
  // HTML text and attribute context (element ids, heading text).
  const idHtml = escapeHtml(rawId);
  // Inline-handler context: encode as a JS string literal first, then escape
  // that literal for the surrounding double-quoted HTML attribute.
  const idJsArg = escapeHtml(JSON.stringify(rawId));
  // URL path context: encode each segment but keep the "owner/name" slash.
  const idPath = escapeHtml(
    rawId.split("/").map(encodeURIComponent).join("/")
  );
  const matchPercent = (result.similarity * 100).toFixed(1);

  const cardHtml = `
    <div class="card bg-white p-6 rounded-lg shadow hover:shadow-md transition-shadow">
      <div class="flex items-start justify-between">
        <div class="space-y-2 w-full">
          <div class="flex items-center justify-between">
            <div class="flex items-center gap-2">
              <i data-lucide="database" class="text-blue-500"></i>
              <h3 class="text-lg font-semibold">${idHtml}</h3>
            </div>
            <div class="flex items-center gap-2">
              <div class="flex items-center gap-4 text-sm text-gray-500 mr-4">
                <span class="flex items-center gap-1">
                  <i data-lucide="heart" class="w-4 h-4"></i>
                  ${escapeHtml(result.likes)}
                </span>
                <span class="flex items-center gap-1">
                  <i data-lucide="download" class="w-4 h-4"></i>
                  ${escapeHtml(result.downloads)}
                </span>
              </div>
              <span class="bg-blue-50 px-2 py-1 rounded text-sm">
                ${matchPercent}% match
              </span>
              <button
                type="button"
                onclick="findSimilarFromResult(${idJsArg})"
                class="flex items-center gap-1 text-sm text-blue-500 hover:text-blue-700"
              >
                <i data-lucide="arrow-right"></i>
                Find Similar
              </button>
            </div>
          </div>
          <p class="text-sm text-gray-600">${escapeHtml(result.summary)}</p>

          <!-- Preview section starts hidden -->
          <div id="preview-section-${idHtml}" class="mt-4 border-t pt-4 hidden">
            <button
              type="button"
              onclick="togglePreview(${idJsArg})"
              class="flex items-center gap-2 text-sm text-gray-600 hover:text-gray-800"
            >
              <i data-lucide="chevron-right" id="preview-icon-${idHtml}" class="transition-transform"></i>
              Preview Dataset
            </button>
            <div id="preview-content-${idHtml}" class="hidden mt-4">
              <iframe
                src="https://huggingface.co/datasets/${idPath}/embed/viewer/default/train"
                title="Dataset viewer for ${idHtml}"
                loading="lazy"
                frameborder="0"
                width="100%"
                height="560px"
              ></iframe>
            </div>
          </div>

          <a href="https://huggingface.co/datasets/${idPath}"
            target="_blank"
            rel="noopener noreferrer"
            class="inline-flex items-center gap-1 text-sm text-blue-500 hover:text-blue-700 mt-2">
            <i data-lucide="external-link" class="w-4 h-4"></i>
            View on Hugging Face Hub
          </a>
        </div>
      </div>
    </div>
  `;

  // Fire-and-forget: the check is asynchronous, so it resolves after the
  // caller has inserted this card into the DOM and can then unhide the preview.
  checkDatasetValidity(result.dataset_id);

  return cardHtml;
}
|
|
|
|
|
/**
 * Ask the datasets server whether a dataset has a working viewer and, if so,
 * reveal that dataset's preview section on its result card.
 *
 * Failures are logged and otherwise ignored: the preview simply stays hidden.
 *
 * @param {string} datasetId - Dataset id as returned by the search API.
 * @returns {Promise<void>} Resolves when the check finishes; never rejects.
 */
async function checkDatasetValidity(datasetId) {
  try {
    // Encode the id so characters such as "&", "#" or "+" cannot alter the query string.
    const response = await fetch(
      `https://datasets-server.huggingface.co/is-valid?dataset=${encodeURIComponent(
        datasetId
      )}`
    );
    if (!response.ok) {
      throw new Error(`is-valid request failed with status ${response.status}`);
    }

    const data = await response.json();
    if (!data || !data.viewer) return;

    // The card may have been replaced by a newer render, so the lookup can miss.
    const previewSection = document.getElementById(
      `preview-section-${datasetId}`
    );
    if (previewSection) {
      previewSection.classList.remove("hidden");
    }
  } catch (error) {
    console.error(
      `Failed to check validity for dataset ${datasetId}:`,
      error
    );
  }
}
|
|
|
|
|
// Id of the most recent search invocation. Responses that belong to an older
// id are dropped so a slow reply cannot overwrite newer (or cleared) results.
let latestSearchRequestId = 0;

/**
 * Debounced keyword search. Fetches the top `RESULTS_PER_PAGE * page` matches
 * for `query` and renders them through displayResults().
 *
 * @param {string} query - Raw text from the search field; surrounding
 *   whitespace is ignored.
 * @param {number} [page=1] - Page to load. Results are cumulative, so page N
 *   requests N * RESULTS_PER_PAGE items.
 */
const searchDatasets = _.debounce(async (query, page = 1) => {
  const requestId = ++latestSearchRequestId;
  const trimmedQuery = query.trim();

  // Keep the shared page counter in step with what is shown, so loadMore()
  // continues from this page rather than from a previous query's page.
  currentPage = page;

  if (trimmedQuery.length < MIN_SEARCH_LENGTH) {
    document.getElementById("resultsContainer").innerHTML = "";
    // Any in-flight request is now stale and will not hide the loader itself.
    document.getElementById("searchLoader").classList.add("hidden");
    return;
  }

  document.getElementById("searchLoader").classList.remove("hidden");
  document.getElementById("errorMessage").classList.add("hidden");

  try {
    const response = await fetch(
      `${API_URL}/search/datasets?query=${encodeURIComponent(
        trimmedQuery
      )}&k=${RESULTS_PER_PAGE * page}`
    );
    if (!response.ok) throw new Error("Search failed");

    const data = await response.json();
    // A newer search started while this one was in flight: discard the reply.
    if (requestId !== latestSearchRequestId) return;
    displayResults(data.results, page);
  } catch (error) {
    if (requestId !== latestSearchRequestId) return;
    console.error("Search error:", error);
    showError("Failed to perform search. Please try again.");
  } finally {
    // Only the latest request owns the spinner.
    if (requestId === latestSearchRequestId) {
      document.getElementById("searchLoader").classList.add("hidden");
    }
  }
}, DEBOUNCE_MS);
|
|
|
|
|
/**
 * Look up datasets similar to the id typed into #datasetInput and render them.
 *
 * @param {number} [page=1] - Page to load. Results are cumulative, so page N
 *   requests N * RESULTS_PER_PAGE items.
 * @returns {Promise<void>} Resolves once the results or an error are shown.
 */
async function findSimilarDatasets(page = 1) {
  const datasetId = document.getElementById("datasetInput").value.trim();
  if (!datasetId) return;

  // Keep the shared page counter in step with what is shown, so a fresh lookup
  // restarts pagination instead of continuing from the previous dataset's page.
  currentPage = page;

  document.getElementById("similarLoader").classList.remove("hidden");
  document.getElementById("errorMessage").classList.add("hidden");

  try {
    const response = await fetch(
      `${API_URL}/similarity/datasets?dataset_id=${encodeURIComponent(
        datasetId
      )}&k=${RESULTS_PER_PAGE * page}`
    );
    if (!response.ok) throw new Error("Similarity search failed");

    const data = await response.json();
    displayResults(data.results, page);
  } catch (error) {
    // Log the cause as searchDatasets does; the user sees a generic message.
    console.error("Similarity search error:", error);
    showError("Failed to find similar datasets. Please try again.");
  } finally {
    document.getElementById("similarLoader").classList.add("hidden");
  }
}
|
|
|
|
|
/**
 * Render a result list into #resultsContainer, followed by either a
 * "Load More" button or an end-of-results notice.
 *
 * @param {Array<Object>} results - Hits to show; each is passed to
 *   createResultCard(). A missing or empty list shows "No results found".
 * @param {number} [page=1] - Page that produced `results`; used to decide
 *   whether another page may be requested.
 */
function displayResults(results, page = 1) {
  const container = document.getElementById("resultsContainer");

  if (!(results && results.length > 0)) {
    container.innerHTML = `
      <div class="text-center text-gray-500">
        No results found
      </div>
    `;
    return;
  }

  // More can be loaded only if this page came back full and the next page
  // would still fit under MAX_RESULTS.
  const requestedCount = RESULTS_PER_PAGE * page;
  const canLoadMore =
    results.length >= requestedCount &&
    requestedCount + RESULTS_PER_PAGE <= MAX_RESULTS;

  let footerHtml = "";
  if (canLoadMore) {
    footerHtml = `<button
        type="button"
        onclick="loadMore()"
        class="w-full mt-4 px-6 py-3 bg-gray-100 hover:bg-gray-200 text-gray-700 rounded-lg transition-colors flex items-center gap-2 justify-center"
      >
        <i data-lucide="more-horizontal"></i>
        Load More Results
      </button>`;
  } else if (results.length >= MAX_RESULTS) {
    footerHtml = `<div class="text-center mt-4 p-6 bg-blue-50 rounded-lg">
        <p class="text-gray-700 mb-3">🎉 You've reached the end of our dataset journey! (${MAX_RESULTS} results)</p>
        <p class="text-gray-600 mb-4">Can't find what you're looking for? Why not create and share your own dataset?</p>
        <a href="https://huggingface.co/docs/datasets/upload_dataset"
          target="_blank"
          rel="noopener noreferrer"
          class="inline-flex items-center gap-2 text-blue-500 hover:text-blue-700">
          <i data-lucide="external-link"></i>
          Learn how to share your dataset on Hugging Face
        </a>
      </div>`;
  }

  const cardsHtml = results.map((result) => createResultCard(result)).join("");

  container.innerHTML = `
    <div class="flex justify-between items-center">
      <h2 class="text-lg font-semibold">Results</h2>
      <span class="text-sm text-gray-500">Found ${results.length} results</span>
    </div>
    ${cardsHtml}
    ${footerHtml}
  `;

  // The new markup contains data-lucide placeholders that still need rendering.
  lucide.createIcons();
}
|
|
|
|
|
/**
 * Show a message in the shared error banner.
 *
 * @param {string} message - Text to display; assigned as textContent, so it is
 *   never interpreted as markup.
 */
function showError(message) {
  const banner = document.getElementById("errorMessage");
  banner.classList.remove("hidden");
  banner.textContent = message;
}
|
|
|
|
|
// Search-as-you-type: every edit of the keyword field triggers the debounced search.
document.getElementById("searchInput").addEventListener("input", ({ target }) => {
  searchDatasets(target.value);
});

// Pressing Enter in the dataset-id field behaves like clicking its Search button.
document.getElementById("datasetInput").addEventListener("keydown", ({ key }) => {
  if (key !== "Enter") return;
  findSimilarDatasets();
});
|
|
|
|
|
/**
 * Handler for a result card's "Find Similar" button: open the similarity tab,
 * prefill its input with the card's dataset id, and run the lookup.
 *
 * @param {string} datasetId - Dataset id of the card that was clicked.
 */
function findSimilarFromResult(datasetId) {
  // Switch first so the prefilled field is on the visible tab.
  switchTab("similar");
  document.getElementById("datasetInput").value = datasetId;
  findSimilarDatasets();
}
|
|
|
|
|
/**
 * Expand or collapse the "How it works" section and rotate its chevron to
 * match (pointing down while the section is open).
 */
function toggleAccordion() {
  const panel = document.getElementById("accordionContent");
  const chevron = document.getElementById("accordionIcon");

  // classList.toggle returns true when the class is present afterwards.
  const nowHidden = panel.classList.toggle("hidden");
  if (nowHidden) {
    chevron.style.transform = "rotate(0deg)";
  } else {
    chevron.style.transform = "rotate(90deg)";
  }
}
|
|
|
|
|
/**
 * Handler for the "Load More Results" button: advance the shared page counter
 * and re-run whichever search belongs to the currently active tab.
 */
function loadMore() {
  currentPage += 1;

  const activeTrigger = document.querySelector(".tab-trigger.active");
  if (activeTrigger.id !== "searchTab") {
    findSimilarDatasets(currentPage);
    return;
  }

  const keywordField = document.getElementById("searchInput");
  searchDatasets(keywordField.value, currentPage);
}
|
|
|
|
|
/**
 * Show or hide the embedded viewer on one result card and rotate that card's
 * chevron to match.
 *
 * @param {string} datasetId - Dataset id used to build the element ids
 *   `preview-content-<id>` and `preview-icon-<id>`.
 */
function togglePreview(datasetId) {
  const viewer = document.getElementById(`preview-content-${datasetId}`);
  const chevron = document.getElementById(`preview-icon-${datasetId}`);

  // classList.toggle returns true when the class is present afterwards.
  const nowHidden = viewer.classList.toggle("hidden");
  if (nowHidden) {
    chevron.style.transform = "rotate(0deg)";
  } else {
    chevron.style.transform = "rotate(90deg)";
  }
}
|
</script> |
|
</body> |
|
</html> |
|
|