
davanstrien
HF Staff
Update findSimilarFromResult function and improve dataset suggestions handling
6e41f47
<html lang="en"> | |
<head> | |
<meta charset="UTF-8" /> | |
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> | |
<title>Hub Semantic Search</title> | |
<script src="https://cdn.tailwindcss.com"></script> | |
<script src="https://unpkg.com/lucide@latest"></script> | |
<script src="https://cdn.jsdelivr.net/npm/lodash@4.17.21/lodash.min.js"></script>
</head> | |
<body> | |
<div class="w-full max-w-4xl mx-auto p-4 space-y-8"> | |
<h1 class="text-3xl font-bold text-gray-800">Hub Semantic Search</h1> | |
<div | |
class="bg-gradient-to-br from-blue-50 to-indigo-50 p-6 rounded-xl shadow-sm border border-blue-100 mb-6" | |
> | |
<h2 | |
class="text-lg font-semibold mb-2 text-gray-800 flex items-center gap-2" | |
> | |
<i data-lucide="search" class="text-blue-500"></i> | |
Welcome to Hub Semantic Search | |
</h2> | |
<p class="text-gray-700 mb-2 text-sm"> | |
Find and explore the 🤗 Hub via semantic search on LLM-generated
summaries!
</p> | |
<div | |
class="bg-blue-100 text-blue-800 px-3 py-1.5 rounded-md mb-2 text-sm" | |
> | |
<p class="flex items-center gap-2"> | |
<i data-lucide="info"></i> Currently supporting dataset search only. | |
Model search coming soon! | |
</p> | |
</div> | |
<button | |
onclick="toggleAccordion()" | |
id="accordionButton" | |
class="text-blue-500 hover:text-blue-700 flex items-center gap-2 text-sm" | |
> | |
<i | |
data-lucide="chevron-right" | |
id="accordionIcon" | |
class="transition-transform" | |
></i> | |
<span>How it works</span> | |
</button> | |
<div id="accordionContent" class="hidden"> | |
<ul | |
class="list-disc list-inside space-y-1 text-gray-600 ml-4 mt-2 text-sm" | |
> | |
<li> | |
<strong>AI-Generated Summaries:</strong> Each dataset is indexed | |
using a concise summary generated by an LLM | |
</li> | |
<li> | |
<strong>Semantic Search:</strong> Find semantically similar | |
resources based on these summaries | |
</li> | |
<li> | |
<strong>Find Similar:</strong> Discover related resources using | |
semantic matching | |
</li> | |
</ul> | |
</div> | |
</div> | |
<div class="tabs w-full"> | |
<div class="tab-list flex gap-2 border-b mb-6"> | |
<button | |
onclick="switchTab('search')" | |
id="searchTab" | |
class="tab-trigger active px-4 sm:px-6 py-3 flex items-center gap-2 border-b-2 border-transparent hover:bg-gray-50 transition-colors flex-1 justify-center" | |
> | |
<i data-lucide="search"></i> Search | |
</button> | |
<button | |
onclick="switchTab('similar')" | |
id="similarTab" | |
class="tab-trigger px-4 sm:px-6 py-3 flex items-center gap-2 border-b-2 border-transparent hover:bg-gray-50 transition-colors flex-1 justify-center" | |
> | |
<i data-lucide="arrow-right"></i> Find Similar | |
</button> | |
</div> | |
<div id="searchContent" class="tab-content space-y-4"> | |
<div | |
class="card bg-white p-8 rounded-xl shadow-sm border border-gray-100" | |
> | |
<p class="text-gray-600 mb-4"> | |
Enter keywords to search through dataset descriptions. The search | |
will automatically update as you type. | |
</p> | |
<div class="relative"> | |
<input | |
type="text" | |
id="searchInput" | |
placeholder="Type to search (minimum 3 characters)..." | |
class="w-full p-3 border rounded-lg pr-10 focus:ring-2 focus:ring-blue-100 focus:border-blue-300 transition-all outline-none" | |
/> | |
<div id="searchLoader" class="hidden absolute right-3 top-2"> | |
<i data-lucide="loader-2" class="animate-spin"></i> | |
</div> | |
</div> | |
</div> | |
</div> | |
<div id="similarContent" class="hidden tab-content space-y-4"> | |
<div | |
class="card bg-white p-8 rounded-xl shadow-sm border border-gray-100" | |
> | |
<p class="text-gray-600 mb-4"> | |
Enter a dataset ID to find similar datasets. Popular datasets will | |
appear as you type. | |
</p> | |
<div class="flex gap-3"> | |
<div class="relative w-full"> | |
<input | |
type="text" | |
id="datasetInput" | |
class="w-full p-3 border border-gray-200 rounded-lg" | |
placeholder="e.g. openai/gsm8k" | |
/> | |
<div | |
id="suggestionsBox" | |
class="hidden absolute w-full mt-1 bg-white border border-gray-200 rounded-lg shadow-lg z-10 max-h-60 overflow-y-auto" | |
></div> | |
</div> | |
<button onclick="findSimilarDatasets()" class="btn-primary"> | |
Find Similar | |
</button> | |
</div> | |
</div> | |
</div> | |
<div | |
id="errorMessage" | |
class="hidden mt-4 p-4 text-red-600 bg-red-50 rounded-md" | |
></div> | |
<div id="resultsContainer" class="mt-6 space-y-4"></div> | |
</div> | |
</div> | |
<style> | |
.tab-trigger.active { | |
border-bottom-color: #3b82f6; | |
color: #3b82f6; | |
} | |
</style> | |
<script> | |
// ---- Configuration -------------------------------------------------------
// Base URL of the backend that serves the /search and /similarity endpoints.
const API_URL = "https://davanstrien-huggingface-datasets-search-v2.hf.space";
// Queries shorter than this many characters are not sent to the backend.
const MIN_SEARCH_LENGTH = 3;
// Wait (in milliseconds) handed to the debounce wrapper around live search.
const DEBOUNCE_MS = 300;
// Each "page" asks the backend for this many more results.
const RESULTS_PER_PAGE = 5;
// Paging stops once this many results would be exceeded.
const MAX_RESULTS = 100;
// Page counter shared by both tabs; reset on tab switch, bumped by loadMore().
let currentPage = 1;

// Render the Lucide icons declared in the static markup.
lucide.createIcons();
/**
 * Show one tab panel and mark its trigger button as active.
 *
 * @param {string} tabId - Prefix shared by the panel (`${tabId}Content`) and
 *   its trigger (`${tabId}Tab`); the markup uses "search" and "similar".
 */
function switchTab(tabId) {
  // Switching tabs starts pagination over.
  currentPage = 1;

  // Hide every panel and deactivate every trigger first ...
  for (const panel of document.querySelectorAll(".tab-content")) {
    panel.classList.add("hidden");
  }
  for (const tabButton of document.querySelectorAll(".tab-trigger")) {
    tabButton.classList.remove("active");
  }

  // ... then reveal only the requested pair.
  const selectedPanel = document.getElementById(`${tabId}Content`);
  selectedPanel.classList.remove("hidden");
  const selectedTrigger = document.getElementById(`${tabId}Tab`);
  selectedTrigger.classList.add("active");
}
/**
 * Escape a value so it can be interpolated into HTML text or into a
 * double-quoted attribute without being interpreted as markup.
 *
 * @param {*} value - Any value; it is stringified first.
 * @returns {string} The escaped string.
 */
function escapeHtml(value) {
  return String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Build the HTML for one result card.
 *
 * All API-supplied fields are escaped before interpolation. The dataset id is
 * handed to the inline handlers through a `data-dataset-id` attribute instead
 * of being spliced into a JavaScript string literal, so an id containing
 * quotes can neither break the markup nor inject script.
 *
 * Side effect: starts `checkDatasetValidity(result.dataset_id)`. That call is
 * made BEFORE the returned markup is inserted into the page; it only works
 * because the check is asynchronous and the caller inserts the markup
 * synchronously. The preview section therefore starts hidden.
 *
 * The preview iframe is marked `loading="lazy"` so that hidden previews do
 * not have to be fetched for every card up front.
 *
 * @param {{dataset_id: string, likes: *, downloads: *, similarity: number,
 *   summary: string}} result - One entry of the backend's `results` array.
 * @returns {string} HTML markup for the card.
 */
function createResultCard(result) {
  const datasetId = String(result.dataset_id);
  const safeId = escapeHtml(datasetId);
  // Encode each path segment but keep the "/" between owner and name.
  const hubPath = escapeHtml(
    datasetId.split("/").map(encodeURIComponent).join("/")
  );

  const cardHtml = `
    <div class="card bg-white p-4 sm:p-6 rounded-lg shadow hover:shadow-md transition-shadow">
      <div class="space-y-2 w-full">
        <div class="flex flex-col sm:flex-row sm:items-center justify-between gap-2">
          <div class="flex items-center gap-2">
            <i data-lucide="database" class="text-blue-500"></i>
            <h3 class="text-lg font-semibold">${safeId}</h3>
          </div>
          <div class="flex flex-wrap items-center gap-2">
            <div class="flex items-center gap-4 text-sm text-gray-500">
              <span class="flex items-center gap-1">
                <i data-lucide="heart" class="w-4 h-4"></i>
                ${escapeHtml(result.likes)}
              </span>
              <span class="flex items-center gap-1">
                <i data-lucide="download" class="w-4 h-4"></i>
                ${escapeHtml(result.downloads)}
              </span>
            </div>
            <span class="bg-blue-50 px-2 py-1 rounded text-sm">
              ${(result.similarity * 100).toFixed(1)}% match
            </span>
            <button
              type="button"
              data-dataset-id="${safeId}"
              onclick="findSimilarFromResult(this.dataset.datasetId)"
              class="flex items-center gap-1 text-sm text-blue-500 hover:text-blue-700"
            >
              <i data-lucide="arrow-right"></i>
              Find Similar
            </button>
          </div>
        </div>
        <p class="text-sm text-gray-600">${escapeHtml(result.summary)}</p>
        <!-- Preview section starts hidden; checkDatasetValidity reveals it -->
        <div id="preview-section-${safeId}" class="mt-4 border-t pt-4 hidden">
          <button
            type="button"
            data-dataset-id="${safeId}"
            onclick="togglePreview(this.dataset.datasetId)"
            class="flex items-center gap-2 text-sm text-gray-600 hover:text-gray-800"
          >
            <i data-lucide="chevron-right" id="preview-icon-${safeId}" class="transition-transform"></i>
            Preview Dataset
          </button>
          <div id="preview-content-${safeId}" class="hidden mt-4">
            <iframe
              src="https://huggingface.co/datasets/${hubPath}/embed/viewer/default/train"
              title="Dataset viewer for ${safeId}"
              loading="lazy"
              frameborder="0"
              width="100%"
              height="560px"
            ></iframe>
          </div>
        </div>
        <a href="https://huggingface.co/datasets/${hubPath}"
          target="_blank"
          rel="noopener noreferrer"
          class="inline-flex items-center gap-1 text-sm text-blue-500 hover:text-blue-700 mt-2">
          <i data-lucide="external-link" class="w-4 h-4"></i>
          View on Hugging Face Hub
        </a>
      </div>
    </div>
  `;

  // Kick off the availability check now; it resolves after the caller has
  // inserted the markup, at which point the preview section can be found.
  checkDatasetValidity(result.dataset_id);
  return cardHtml;
}
/**
 * Ask the dataset viewer service whether a dataset has a working viewer and,
 * if so, reveal that dataset's (initially hidden) preview section.
 *
 * Failures are logged and otherwise ignored: the preview section simply stays
 * hidden, which is the safe default.
 *
 * @param {string} datasetId - Dataset identifier, e.g. "openai/gsm8k".
 * @returns {Promise<void>}
 */
async function checkDatasetValidity(datasetId) {
  try {
    // Encode the id so characters such as "&" or "#" cannot alter the query.
    const response = await fetch(
      `https://datasets-server.huggingface.co/is-valid?dataset=${encodeURIComponent(
        datasetId
      )}`
    );
    if (!response.ok) {
      throw new Error(`is-valid request returned HTTP ${response.status}`);
    }
    const data = await response.json();

    // Show the preview section only if the viewer is available.
    if (data && data.viewer) {
      const previewSection = document.getElementById(
        `preview-section-${datasetId}`
      );
      // The card may have been replaced by a newer result list meanwhile.
      if (previewSection) {
        previewSection.classList.remove("hidden");
      }
    }
  } catch (error) {
    console.error(
      `Failed to check validity for dataset ${datasetId}:`,
      error
    );
  }
}
// Sequence number of the most recent search invocation. A response is only
// applied when its own number still matches, so slow, out-of-date responses
// cannot overwrite newer results (or repopulate a list that was cleared).
let latestSearchRequest = 0;

/**
 * Debounced semantic search over datasets.
 *
 * Requests `RESULTS_PER_PAGE * page` results for the query and renders them
 * through `displayResults`. Queries shorter than `MIN_SEARCH_LENGTH` (after
 * trimming) clear the result list instead of hitting the backend.
 *
 * `currentPage` is synchronised with `page` so that a fresh query (page 1)
 * followed by "Load More" continues from page 2 rather than from whatever
 * page an earlier query had reached.
 *
 * @param {string} query - Raw text from the search box.
 * @param {number} [page=1] - 1-based page; determines how many results to ask for.
 */
const searchDatasets = _.debounce(async (query, page = 1) => {
  const requestId = ++latestSearchRequest;
  const loader = document.getElementById("searchLoader");
  const trimmedQuery = query.trim();

  currentPage = page;

  if (trimmedQuery.length < MIN_SEARCH_LENGTH) {
    document.getElementById("resultsContainer").innerHTML = "";
    // An older request may still be in flight; it will no longer hide the
    // loader itself because it is stale, so do it here.
    loader.classList.add("hidden");
    return;
  }

  loader.classList.remove("hidden");
  document.getElementById("errorMessage").classList.add("hidden");

  try {
    const response = await fetch(
      `${API_URL}/search/datasets?query=${encodeURIComponent(
        trimmedQuery
      )}&k=${RESULTS_PER_PAGE * page}`
    );
    if (!response.ok) {
      throw new Error(`Search failed with HTTP ${response.status}`);
    }
    const data = await response.json();

    // Drop the response if a newer search was started while waiting.
    if (requestId !== latestSearchRequest) return;
    displayResults(data.results, page);
  } catch (error) {
    // A failure of a superseded request is not worth reporting to the user.
    if (requestId !== latestSearchRequest) return;
    console.error("Search error:", error);
    showError("Failed to perform search. Please try again.");
  } finally {
    // Only the newest request controls the spinner.
    if (requestId === latestSearchRequest) {
      loader.classList.add("hidden");
    }
  }
}, DEBOUNCE_MS);
// In-memory cache of the suggestion list and the time it was filled.
let trendingDatasetsCache = null;
let cacheTimestamp = null;
const CACHE_DURATION = 1000 * 60 * 15; // 15 minutes

/**
 * Return up to 20 dataset ids to offer as suggestions.
 *
 * The list is served from the in-memory cache while it is younger than
 * CACHE_DURATION; otherwise it is re-fetched from the Hub API. On any error
 * the failure is logged and an empty list is returned (nothing is cached).
 *
 * NOTE(review): this relies on the API's default ordering being the
 * "trending" order — confirm against the Hub API documentation.
 *
 * @returns {Promise<string[]>} Dataset ids, at most 20.
 */
async function fetchTrendingDatasets() {
  const cacheIsFresh =
    trendingDatasetsCache &&
    cacheTimestamp &&
    Date.now() - cacheTimestamp < CACHE_DURATION;
  if (cacheIsFresh) {
    return trendingDatasetsCache;
  }

  try {
    const hubResponse = await fetch("https://huggingface.co/api/datasets");
    const listing = await hubResponse.json();

    // Keep only the ids of the first 20 entries.
    const topIds = listing.slice(0, 20).map(({ id }) => id);

    trendingDatasetsCache = topIds;
    cacheTimestamp = Date.now();
    return topIds;
  } catch (error) {
    console.error("Error fetching trending datasets:", error);
    return [];
  }
}
/**
 * Render the suggestion dropdown.
 *
 * Rows are built as DOM nodes: the id is written with `textContent` and the
 * click handler is attached with `addEventListener`, so an id is never parsed
 * as HTML or spliced into a JavaScript string.
 *
 * @param {string[]} datasets - Dataset ids to offer; an empty list hides the box.
 * @param {HTMLElement} suggestionsBox - Container element for the rows.
 */
function displaySuggestions(datasets, suggestionsBox) {
  if (!Array.isArray(datasets) || datasets.length === 0) {
    suggestionsBox.classList.add("hidden");
    return;
  }

  const rows = datasets.map((datasetId) => {
    const row = document.createElement("div");
    row.className =
      "p-3 hover:bg-gray-50 cursor-pointer border-b last:border-b-0";
    row.addEventListener("click", () => selectSuggestion(datasetId));

    const line = document.createElement("div");
    line.className = "flex items-center gap-2";

    // Placeholder that lucide.createIcons() below turns into an icon.
    const icon = document.createElement("i");
    icon.setAttribute("data-lucide", "database");
    icon.className = "w-4 h-4 text-blue-500";

    const label = document.createElement("span");
    label.textContent = datasetId;

    line.append(icon, label);
    row.append(line);
    return row;
  });

  suggestionsBox.replaceChildren(...rows);
  suggestionsBox.classList.remove("hidden");
  lucide.createIcons();
}
/**
 * Handle a click on a suggestion: copy the id into the dataset input, close
 * the dropdown and immediately run the similarity search.
 *
 * @param {string} dataset - The chosen dataset id.
 */
function selectSuggestion(dataset) {
  document.getElementById("datasetInput").value = dataset;
  document.getElementById("suggestionsBox").classList.add("hidden");
  findSimilarDatasets();
}
/**
 * Look up datasets similar to the id typed into the dataset input and render
 * them through `displayResults`.
 *
 * The id is trimmed first, so stray whitespace from typing or pasting does
 * not produce a failed lookup; an empty id is a no-op. `currentPage` is
 * synchronised with `page` so that a fresh lookup followed by "Load More"
 * continues from page 2.
 *
 * @param {number} [page=1] - 1-based page; `RESULTS_PER_PAGE * page` results
 *   are requested.
 * @returns {Promise<void>}
 */
async function findSimilarDatasets(page = 1) {
  const datasetId = document.getElementById("datasetInput").value.trim();
  if (!datasetId) return;

  currentPage = page;

  // No element with this id exists in the visible markup, hence the guards.
  const similarLoader = document.getElementById("similarLoader");
  if (similarLoader) {
    similarLoader.classList.remove("hidden");
  }
  document.getElementById("errorMessage").classList.add("hidden");

  try {
    const response = await fetch(
      `${API_URL}/similarity/datasets?dataset_id=${encodeURIComponent(
        datasetId
      )}&k=${RESULTS_PER_PAGE * page}`
    );
    if (!response.ok) {
      throw new Error(`Similarity search failed with HTTP ${response.status}`);
    }
    const data = await response.json();
    displayResults(data.results, page);
  } catch (error) {
    // Log the cause as well; the user-facing message is deliberately generic.
    console.error("Similarity search error:", error);
    showError("Failed to find similar datasets. Please try again.");
  } finally {
    if (similarLoader) {
      similarLoader.classList.add("hidden");
    }
  }
}
/**
 * Replace the contents of the results container with the given results.
 *
 * Below the cards one of three footers is shown: a "Load More Results" button
 * while the backend returned a full page and another page still fits under
 * MAX_RESULTS, an end-of-list notice once MAX_RESULTS results are shown, or
 * nothing.
 *
 * @param {Array<Object>} results - Result objects as returned by the backend.
 * @param {number} [page=1] - The page that produced `results`.
 */
function displayResults(results, page = 1) {
  const container = document.getElementById("resultsContainer");
  console.log("Displaying results:", results);

  // Guard clause: nothing to show.
  if (!(results && results.length > 0)) {
    container.innerHTML = `
      <div class="text-center text-gray-500">
        No results found
      </div>
    `;
    return;
  }

  const headerHtml = `
    <div class="flex justify-between items-center">
      <h2 class="text-lg font-semibold">Results</h2>
      <span class="text-sm text-gray-500">Found ${results.length} results</span>
    </div>`;

  const cardsHtml = results.map((result) => createResultCard(result)).join("");

  const gotFullPage = results.length >= RESULTS_PER_PAGE * page;
  const nextPageFits = RESULTS_PER_PAGE * (page + 1) <= MAX_RESULTS;

  let footerHtml = "";
  if (gotFullPage && nextPageFits) {
    footerHtml = `<button
      onclick="loadMore()"
      class="w-full mt-4 px-6 py-3 bg-gray-100 hover:bg-gray-200 text-gray-700 rounded-lg transition-colors flex items-center gap-2 justify-center"
    >
      <i data-lucide="more-horizontal"></i>
      Load More Results
    </button>`;
  } else if (results.length >= MAX_RESULTS) {
    footerHtml = `<div class="text-center mt-4 p-6 bg-blue-50 rounded-lg">
      <p class="text-gray-700 mb-3">You've reached the end of our dataset journey! (${MAX_RESULTS} results)</p>
      <p class="text-gray-600 mb-4">Can't find what you're looking for? Why not create and share your own dataset?</p>
      <a href="https://huggingface.co/docs/datasets/upload_dataset"
        target="_blank"
        class="inline-flex items-center gap-2 text-blue-500 hover:text-blue-700">
        <i data-lucide="external-link"></i>
        Learn how to share your dataset on Hugging Face
      </a>
    </div>`;
  }

  container.innerHTML = `${headerHtml}
    ${cardsHtml}
    ${footerHtml}
  `;
  // Turn the freshly inserted icon placeholders into icons.
  lucide.createIcons();
}
/**
 * Put a message into the shared error banner and make the banner visible.
 *
 * @param {string} message - Text to show; it is set as plain text, not HTML.
 */
function showError(message) {
  const banner = document.getElementById("errorMessage");
  banner.textContent = message;
  banner.classList.remove("hidden");
}
// Wire up the two text inputs. The block scope keeps the helper names local.
{
  const searchField = document.getElementById("searchInput");
  const datasetField = document.getElementById("datasetInput");

  // Live search: every edit of the search box triggers the debounced search.
  searchField.addEventListener("input", (event) => {
    searchDatasets(event.target.value);
  });

  // Pressing Enter in the dataset box runs the similarity lookup.
  datasetField.addEventListener("keydown", (event) => {
    if (event.key !== "Enter") return;
    findSimilarDatasets();
  });
}
/**
 * "Find Similar" action of a result card: switch to the similarity tab,
 * fill in the card's dataset id and run the lookup.
 *
 * @param {string} datasetId - Id of the dataset the card represents.
 */
function findSimilarFromResult(datasetId) {
  switchTab("similar");

  // Assigning .value does not focus the input.
  document.getElementById("datasetInput").value = datasetId;

  // Make sure a previously opened dropdown does not cover the results.
  document.getElementById("suggestionsBox").classList.add("hidden");

  findSimilarDatasets();
}
/**
 * Expand or collapse the "How it works" section and rotate its chevron
 * (90 degrees when open, 0 degrees when closed).
 */
function toggleAccordion() {
  const panel = document.getElementById("accordionContent");
  const chevron = document.getElementById("accordionIcon");

  // classList.toggle returns true when the class is present afterwards.
  const nowHidden = panel.classList.toggle("hidden");
  if (nowHidden) {
    chevron.style.transform = "rotate(0deg)";
  } else {
    chevron.style.transform = "rotate(90deg)";
  }
}
/**
 * "Load More Results" action: advance the shared page counter and re-run
 * whichever query belongs to the currently active tab with the new page.
 */
function loadMore() {
  currentPage += 1;

  const activeTabId = document.querySelector(".tab-trigger.active").id;
  if (activeTabId !== "searchTab") {
    findSimilarDatasets(currentPage);
    return;
  }

  const searchQuery = document.getElementById("searchInput").value;
  searchDatasets(searchQuery, currentPage);
}
/**
 * Expand or collapse the dataset preview of one result card and rotate its
 * chevron (90 degrees when open, 0 degrees when closed).
 *
 * @param {string} datasetId - Id used in the card's `preview-content-…` and
 *   `preview-icon-…` element ids.
 */
function togglePreview(datasetId) {
  const previewPanel = document.getElementById(`preview-content-${datasetId}`);
  const chevron = document.getElementById(`preview-icon-${datasetId}`);

  // classList.toggle returns true when the class is present afterwards.
  const nowHidden = previewPanel.classList.toggle("hidden");
  if (nowHidden) {
    chevron.style.transform = "rotate(0deg)";
  } else {
    chevron.style.transform = "rotate(90deg)";
  }
}
// Suggestion dropdown behaviour for the dataset input.
//
// Changes compared with the previous version:
//  - the `programmaticFocus` flag was never set to true, so it is removed;
//  - the empty and non-empty input branches were equivalent (every id
//    contains the empty string), so they are merged;
//  - the dropdown now closes on outside click, Escape and Enter;
//  - suggestions are computed from the input's value AFTER the list has been
//    fetched, and are not shown if focus has moved away in the meantime.
document.addEventListener("DOMContentLoaded", () => {
  const datasetInput = document.getElementById("datasetInput");
  const suggestionsBox = document.getElementById("suggestionsBox");

  const hideSuggestions = () => suggestionsBox.classList.add("hidden");

  // While typing: offer the trending ids that contain the typed text.
  datasetInput.addEventListener("input", async () => {
    const trending = await fetchTrendingDatasets();
    // Read the value only now, so a slow fetch cannot render a stale filter.
    const needle = datasetInput.value.toLowerCase();
    const matches = trending.filter((dataset) =>
      dataset.toLowerCase().includes(needle)
    );
    displaySuggestions(matches, suggestionsBox);
  });

  // On focus: offer the full trending list.
  datasetInput.addEventListener("focus", async () => {
    const trending = await fetchTrendingDatasets();
    // The user may have left the field while the list was loading.
    if (document.activeElement !== datasetInput) return;
    displaySuggestions(trending, suggestionsBox);
  });

  // Close the dropdown when the user clicks anywhere outside of it.
  document.addEventListener("click", (event) => {
    const clickedInside =
      event.target === datasetInput || suggestionsBox.contains(event.target);
    if (!clickedInside) hideSuggestions();
  });

  // Escape dismisses the dropdown; Enter starts a lookup (handled by another
  // listener), so the dropdown should get out of the way of the results.
  datasetInput.addEventListener("keydown", (event) => {
    if (event.key === "Escape" || event.key === "Enter") hideSuggestions();
  });
});
</script> | |
</body> | |
</html> | |