|
import { getRequestHeaders } from '../script.js'; |
|
import { renderExtensionTemplateAsync } from './extensions.js'; |
|
import { POPUP_RESULT, POPUP_TYPE, callGenericPopup } from './popup.js'; |
|
import { SlashCommand } from './slash-commands/SlashCommand.js'; |
|
import { ARGUMENT_TYPE, SlashCommandArgument, SlashCommandNamedArgument } from './slash-commands/SlashCommandArgument.js'; |
|
import { SlashCommandParser } from './slash-commands/SlashCommandParser.js'; |
|
import { isValidUrl } from './utils.js'; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Static registry of Data Bank scrapers.
 *
 * A scraper is an object exposing `id`, `name`, `description`, `iconClass`,
 * `iconAvailable`, `isAvailable()`, `scrape()` and, optionally, `init()`.
 */
export class ScraperManager {
    /**
     * Registered scrapers, in registration order.
     * @type {object[]}
     */
    static #scrapers = [];

    /**
     * Looks up a registered scraper by ID and logs a warning when there is none.
     * @param {string} scraperId ID of the scraper to look up
     * @returns {object|undefined} The registered scraper, or undefined if the ID is unknown
     */
    static #findScraper(scraperId) {
        const scraper = ScraperManager.#scrapers.find(s => s.id === scraperId);

        if (!scraper) {
            console.warn(`Scraper with ID ${scraperId} not found`);
        }

        return scraper;
    }

    /**
     * Registers a scraper. A scraper whose ID is already registered is ignored
     * with a warning. If the scraper has an `init` method, it is awaited before
     * the scraper is added to the registry.
     * @param {object} scraper Scraper instance to register
     * @returns {Promise<void>}
     */
    static async registerDataBankScraper(scraper) {
        if (ScraperManager.#scrapers.some(s => s.id === scraper.id)) {
            console.warn(`Scraper with ID ${scraper.id} already registered`);
            return;
        }

        if (scraper.init) {
            await scraper.init();
        }

        ScraperManager.#scrapers.push(scraper);
    }

    /**
     * Lists the registered scrapers as plain descriptor objects
     * (the scraper instances themselves are not exposed).
     * @returns {{id: string, name: string, description: string, iconClass: string, iconAvailable: boolean}[]}
     */
    static getDataBankScrapers() {
        return ScraperManager.#scrapers.map(s => ({ id: s.id, name: s.name, description: s.description, iconClass: s.iconClass, iconAvailable: s.iconAvailable }));
    }

    /**
     * Runs the scraper with the given ID.
     *
     * Always returns a promise, so callers may chain on the result even when
     * the ID is unknown (previously that case returned a bare `undefined`).
     * @param {string} scraperId ID of the scraper to run
     * @returns {Promise<any>} Whatever the scraper's `scrape()` resolves to; undefined if the ID is unknown
     */
    static async runDataBankScraper(scraperId) {
        const scraper = ScraperManager.#findScraper(scraperId);

        if (!scraper) {
            return;
        }

        return scraper.scrape();
    }

    /**
     * Checks whether the scraper with the given ID is available.
     *
     * Always returns a promise; an unknown ID resolves to `false` rather than
     * yielding a synchronous `undefined`.
     * @param {string} scraperId ID of the scraper to check
     * @returns {Promise<boolean>} Result of the scraper's `isAvailable()`; false if the ID is unknown
     */
    static async isScraperAvailable(scraperId) {
        const scraper = ScraperManager.#findScraper(scraperId);

        if (!scraper) {
            return false;
        }

        return scraper.isAvailable();
    }
}
|
|
|
|
|
|
|
|
|
|
|
/**
 * Scraper that lets the user type a text file from scratch in a popup.
 */
class Notepad {
    constructor() {
        this.id = 'text';
        this.name = 'Notepad';
        this.description = 'Create a text file from scratch.';
        this.iconClass = 'fa-solid fa-note-sticky';
        this.iconAvailable = true;
    }

    /**
     * Reports whether the scraper can be used.
     * @returns {Promise<boolean>} Always true
     */
    async isAvailable() {
        return true;
    }

    /**
     * Shows the notepad popup and turns the entered text into a plain-text file.
     * @returns {Promise<File[]|undefined>} A single-element array with the created file,
     * or undefined when the popup result is falsy or no text was entered
     */
    async scrape() {
        const template = $(await renderExtensionTemplateAsync('attachments', 'notepad', {}));
        const defaultFileName = `Untitled - ${new Date().toLocaleString()}`;
        let fileName = defaultFileName;
        let text = '';

        template.find('input[name="notepadFileName"]').val(fileName).on('input', function () {
            fileName = String($(this).val()).trim();
        });

        template.find('textarea[name="notepadFileContent"]').on('input', function () {
            text = String($(this).val());
        });

        const result = await callGenericPopup(template, POPUP_TYPE.CONFIRM, '', { wide: true, large: true, okButton: 'Save', cancelButton: 'Cancel' });

        if (!result || text === '') {
            return;
        }

        // A cleared (or whitespace-only) name field would otherwise yield "Notepad - .txt".
        const file = new File([text], `Notepad - ${fileName || defaultFileName}.txt`, { type: 'text/plain' });
        return [file];
    }
}
|
|
|
|
|
|
|
|
|
|
|
/**
 * Scraper that downloads web pages through the server's `/api/search/visit` endpoint.
 */
class WebScraper {
    constructor() {
        this.id = 'web';
        this.name = 'Web';
        this.description = 'Download a page from the web.';
        this.iconClass = 'fa-solid fa-globe';
        this.iconAvailable = true;
    }

    /**
     * Reports whether the scraper can be used.
     * @returns {Promise<boolean>} Always true
     */
    async isAvailable() {
        return true;
    }

    /**
     * Extracts the contents of the first `<title>` element from an HTML blob.
     *
     * The match is case-insensitive, tolerates attributes on the tag and titles
     * spanning several lines; whitespace runs are collapsed to single spaces.
     * @param {Blob} blob Blob holding the HTML text
     * @returns {Promise<string>} The title, or an empty string when there is none
     */
    async getTitleFromHtmlBlob(blob) {
        const text = await blob.text();
        // [\s\S] instead of "." so that titles containing line breaks are matched too.
        const titleMatch = text.match(/<title(?:\s[^>]*)?>([\s\S]*?)<\/title>/i);
        return titleMatch ? titleMatch[1].replace(/\s+/g, ' ').trim() : '';
    }

    /**
     * Asks the user for one URL per line and downloads each valid URL as an HTML file.
     * @returns {Promise<File[]|undefined>} The downloaded pages; undefined when the popup
     * returns nothing or none of the entered lines is a valid URL
     * @throws {Error} When the server responds with a non-OK status for a link
     * (the message is the response body), or when a request itself fails
     */
    async scrape() {
        const template = $(await renderExtensionTemplateAsync('attachments', 'web-scrape', {}));
        const linksString = await callGenericPopup(template, POPUP_TYPE.INPUT, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel', rows: 4 });

        if (!linksString) {
            return;
        }

        const links = String(linksString).split('\n').map(l => l.trim()).filter(l => l).filter(l => isValidUrl(l));

        if (links.length === 0) {
            toastr.error('Invalid URL');
            return;
        }

        const toast = toastr.info('Working, please wait...');
        const files = [];

        try {
            for (const link of links) {
                const result = await fetch('/api/search/visit', {
                    method: 'POST',
                    headers: getRequestHeaders(),
                    body: JSON.stringify({ url: link }),
                });

                // Without this check an error response body would be saved as if it were the page.
                if (!result.ok) {
                    const error = await result.text();
                    throw new Error(error);
                }

                const blob = await result.blob();
                const domain = new URL(link).hostname;
                const timestamp = Date.now();
                const title = await this.getTitleFromHtmlBlob(blob) || 'webpage';
                const file = new File([blob], `${title} - ${domain} - ${timestamp}.html`, { type: 'text/html' });
                files.push(file);
            }
        } finally {
            // Clear the progress toast on failure as well as on success.
            toastr.clear(toast);
        }

        return files;
    }
}
|
|
|
|
|
|
|
|
|
|
|
/**
 * Scraper that lets the user pick files from their computer.
 */
class FileScraper {
    constructor() {
        this.id = 'file';
        this.name = 'File';
        this.description = 'Upload a file from your computer.';
        this.iconClass = 'fa-solid fa-upload';
        this.iconAvailable = true;
    }

    /**
     * Reports whether the scraper can be used.
     * @returns {Promise<boolean>} Always true
     */
    async isAvailable() {
        return true;
    }

    /**
     * Opens a multi-select file picker via a detached `<input type="file">`.
     * @returns {Promise<File[]>} The selected files; an empty array when the picker
     * is dismissed (in browsers that fire the `cancel` event on file inputs)
     */
    async scrape() {
        return new Promise(resolve => {
            const fileInput = document.createElement('input');
            fileInput.type = 'file';
            fileInput.accept = '*/*';
            fileInput.multiple = true;
            // `files` is guarded because Array.from(null) would throw.
            fileInput.onchange = () => resolve(Array.from(fileInput.files ?? []));
            // Without a cancel handler the promise never settles when the picker is dismissed.
            fileInput.addEventListener('cancel', () => resolve([]));
            fileInput.click();
        });
    }
}
|
|
|
/**
 * Scraper that downloads pages from a MediaWiki wiki through the server plugin
 * endpoints under `/api/plugins/fandom/`.
 */
class MediaWikiScraper {
    constructor() {
        this.id = 'mediawiki';
        this.name = 'MediaWiki';
        this.description = 'Download a page from a MediaWiki wiki.';
        this.iconClass = 'fa-brands fa-wikipedia-w';
        this.iconAvailable = true;
    }

    /**
     * Probes the plugin endpoint to find out whether the scraper can be used.
     * @returns {Promise<boolean>} True when the probe responds with an OK status;
     * false when it does not or when the request throws
     */
    async isAvailable() {
        try {
            const result = await fetch('/api/plugins/fandom/probe-mediawiki', {
                method: 'POST',
                headers: getRequestHeaders(),
            });

            return result.ok;
        } catch (error) {
            console.debug('Could not probe Fandom/MediaWiki plugin', error);
            return false;
        }
    }

    /**
     * Asks the user for a wiki URL, a filter and an output mode, then requests
     * the scrape and converts the returned `{ title, content }` entries to text files.
     * @returns {Promise<File[]|undefined>} One file per entry for 'multi' output, a single
     * combined file for 'single' output, an empty array for any other output value;
     * undefined when the popup is not confirmed or no URL was entered
     * @throws {Error} When the server responds with a non-OK status (message is the response body)
     */
    async scrape() {
        let url = '';
        let filter = '';
        let output = 'single';

        const template = $(await renderExtensionTemplateAsync('attachments', 'mediawiki-scrape', {}));

        template.find('input[name="scrapeInput"]').on('input', function () {
            url = String($(this).val()).trim();
        });

        template.find('input[name="scrapeFilter"]').on('input', function () {
            filter = String($(this).val());
        });

        template.find('input[name="scrapeOutput"]').on('input', function () {
            output = String($(this).val());
        });

        const confirm = await callGenericPopup(template, POPUP_TYPE.CONFIRM, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel' });

        if (confirm !== POPUP_RESULT.AFFIRMATIVE) {
            return;
        }

        if (!url) {
            toastr.error('URL name is required');
            return;
        }

        const toast = toastr.info('Working, please wait...');
        let data;

        try {
            const result = await fetch('/api/plugins/fandom/scrape-mediawiki', {
                method: 'POST',
                headers: getRequestHeaders(),
                body: JSON.stringify({ url, filter }),
            });

            if (!result.ok) {
                const error = await result.text();
                throw new Error(error);
            }

            data = await result.json();
        } finally {
            // Clear the progress toast even when the request fails; otherwise it stays on screen.
            toastr.clear(toast);
        }

        if (output === 'multi') {
            const files = [];
            for (const attachment of data) {
                const file = new File([String(attachment.content).trim()], `${String(attachment.title).trim()}.txt`, { type: 'text/plain' });
                files.push(file);
            }
            return files;
        }

        if (output === 'single') {
            const combinedContent = data.map((a) => String(a.title).trim() + '\n\n' + String(a.content).trim()).join('\n\n\n\n');
            const file = new File([combinedContent], `${url}.txt`, { type: 'text/plain' });
            return [file];
        }

        return [];
    }
}
|
|
|
|
|
|
|
|
|
|
|
/**
 * Scraper that downloads pages from a Fandom wiki through the server plugin
 * endpoints under `/api/plugins/fandom/`.
 */
class FandomScraper {
    constructor() {
        this.id = 'fandom';
        this.name = 'Fandom';
        this.description = 'Download a page from the Fandom wiki.';
        this.iconClass = 'fa-solid fa-fire';
        this.iconAvailable = true;
    }

    /**
     * Probes the plugin endpoint to find out whether the scraper can be used.
     * @returns {Promise<boolean>} True when the probe responds with an OK status;
     * false when it does not or when the request throws
     */
    async isAvailable() {
        try {
            const result = await fetch('/api/plugins/fandom/probe', {
                method: 'POST',
                headers: getRequestHeaders(),
            });

            return result.ok;
        } catch (error) {
            console.debug('Could not probe Fandom plugin', error);
            return false;
        }
    }

    /**
     * Derives a short identifier from the user's input.
     *
     * When the input parses as a URL, the first dot-separated label of its host
     * name is returned; anything that is not a parseable URL is returned as is.
     * @param {string} fandom URL or name entered by the user
     * @returns {string} The derived identifier
     */
    getFandomId(fandom) {
        try {
            const url = new URL(fandom);
            return url.hostname.split('.')[0] || fandom;
        } catch {
            return fandom;
        }
    }

    /**
     * Asks the user for a fandom, a filter and an output mode, then requests
     * the scrape and converts the returned `{ title, content }` entries to text files.
     * @returns {Promise<File[]|undefined>} One file per entry for 'multi' output, a single
     * combined file for 'single' output, an empty array for any other output value;
     * undefined when the popup is not confirmed or no fandom was entered
     * @throws {Error} When the server responds with a non-OK status (message is the response body)
     */
    async scrape() {
        let fandom = '';
        let filter = '';
        let output = 'single';

        const template = $(await renderExtensionTemplateAsync('attachments', 'fandom-scrape', {}));

        template.find('input[name="fandomScrapeInput"]').on('input', function () {
            fandom = String($(this).val()).trim();
        });

        template.find('input[name="fandomScrapeFilter"]').on('input', function () {
            filter = String($(this).val());
        });

        template.find('input[name="fandomScrapeOutput"]').on('input', function () {
            output = String($(this).val());
        });

        const confirm = await callGenericPopup(template, POPUP_TYPE.CONFIRM, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel' });

        if (confirm !== POPUP_RESULT.AFFIRMATIVE) {
            return;
        }

        if (!fandom) {
            toastr.error('Fandom name is required');
            return;
        }

        const toast = toastr.info('Working, please wait...');
        let data;

        try {
            const result = await fetch('/api/plugins/fandom/scrape', {
                method: 'POST',
                headers: getRequestHeaders(),
                body: JSON.stringify({ fandom, filter }),
            });

            if (!result.ok) {
                const error = await result.text();
                throw new Error(error);
            }

            data = await result.json();
        } finally {
            // Clear the progress toast even when the request fails; otherwise it stays on screen.
            toastr.clear(toast);
        }

        if (output === 'multi') {
            const files = [];
            for (const attachment of data) {
                const file = new File([String(attachment.content).trim()], `${String(attachment.title).trim()}.txt`, { type: 'text/plain' });
                files.push(file);
            }
            return files;
        }

        if (output === 'single') {
            const combinedContent = data.map((a) => String(a.title).trim() + '\n\n' + String(a.content).trim()).join('\n\n\n\n');
            // Name the file after the short ID so that URL input does not end up verbatim in the file name.
            // The request above still sends the raw input.
            const file = new File([combinedContent], `${this.getFandomId(fandom)}.txt`, { type: 'text/plain' });
            return [file];
        }

        return [];
    }
}
|
|
|
// Two-letter language codes offered as values for the `lang` argument of /yt-script.
// Kept in ascending alphabetical order, one initial letter per row.
const iso6391Codes = [
    'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'av', 'ay', 'az',
    'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs',
    'ca', 'ce', 'ch', 'co', 'cr', 'cs', 'cu', 'cv', 'cy',
    'da', 'de', 'dv', 'dz',
    'ee', 'el', 'en', 'eo', 'es', 'et', 'eu',
    'fa', 'ff', 'fi', 'fj', 'fo', 'fr', 'fy',
    'ga', 'gd', 'gl', 'gn', 'gu', 'gv',
    'ha', 'he', 'hi', 'ho', 'hr', 'ht', 'hu', 'hy', 'hz',
    'ia', 'id', 'ie', 'ig', 'ii', 'ik', 'io', 'is', 'it', 'iu',
    'ja', 'jv',
    'ka', 'kg', 'ki', 'kj', 'kk', 'kl', 'km', 'kn', 'ko', 'kr', 'ks', 'ku', 'kv', 'kw', 'ky',
    'la', 'lb', 'lg', 'li', 'ln', 'lo', 'lt', 'lu', 'lv',
    'mg', 'mh', 'mi', 'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my',
    'na', 'nb', 'nd', 'ne', 'ng', 'nl', 'nn', 'no', 'nr', 'nv', 'ny',
    'oc', 'oj', 'om', 'or', 'os',
    'pa', 'pi', 'pl', 'ps', 'pt',
    'qu',
    'rm', 'rn', 'ro', 'ru', 'rw',
    'sa', 'sc', 'sd', 'se', 'sg', 'si', 'sk', 'sl', 'sm', 'sn', 'so', 'sq', 'sr', 'ss', 'st', 'su', 'sv', 'sw',
    'ta', 'te', 'tg', 'th', 'ti', 'tk', 'tl', 'tn', 'to', 'tr', 'ts', 'tt', 'tw', 'ty',
    'ug', 'uk', 'ur', 'uz',
    've', 'vi', 'vo',
    'wa', 'wo',
    'xh',
    'yi', 'yo',
    'za', 'zh', 'zu',
];
|
|
|
|
|
|
|
|
|
|
|
/**
 * Scraper that downloads video transcripts through the server's
 * `/api/search/transcript` endpoint. Also registers the `/yt-script` slash command.
 */
class YouTubeScraper {
    constructor() {
        this.id = 'youtube';
        this.name = 'YouTube';
        this.description = 'Download a transcript from a YouTube video.';
        this.iconClass = 'fa-brands fa-youtube';
        this.iconAvailable = true;
    }

    /**
     * Registers the `yt-script` slash command. The command returns the transcript
     * text; on any error it shows the error message as a toast and returns an empty string.
     * @returns {Promise<void>}
     */
    async init() {
        SlashCommandParser.addCommandObject(SlashCommand.fromProps({
            name: 'yt-script',
            callback: async (args, url) => {
                try {
                    if (!url) {
                        throw new Error('URL or ID of the YouTube video is required');
                    }

                    const lang = String(args?.lang || '');
                    const { transcript } = await this.getScript(String(url).trim(), lang);
                    return transcript;
                } catch (error) {
                    toastr.error(error.message);
                    return '';
                }
            },
            helpString: 'Scrape a transcript from a YouTube video by ID or URL.',
            returns: ARGUMENT_TYPE.STRING,
            namedArgumentList: [
                new SlashCommandNamedArgument('lang', 'ISO 639-1 language code of the transcript, e.g. "en"', ARGUMENT_TYPE.STRING, false, false, '', iso6391Codes),
            ],
            unnamedArgumentList: [
                new SlashCommandArgument('URL or ID of the YouTube video', ARGUMENT_TYPE.STRING, true, false),
            ],
        }));
    }

    /**
     * Reports whether the scraper can be used.
     * @returns {Promise<boolean>} Always true
     */
    async isAvailable() {
        return true;
    }

    /**
     * Extracts the video ID from a URL.
     *
     * An input that already consists of exactly 11 ID characters is returned
     * unchanged, as is any input the URL pattern does not match.
     * @param {string} url Video URL or ID
     * @returns {string} The extracted ID, or the input itself
     */
    parseId(url) {
        // Already a bare 11-character ID.
        if (/^[a-zA-Z0-9_-]{11}$/.test(url)) {
            return url;
        }

        const regex = /^.*(?:(?:youtu\.be\/|v\/|vi\/|u\/\w\/|embed\/|shorts\/)|(?:(?:watch)?\?v(?:i)?=|&v(?:i)?=))([^#&?]*).*/;
        const match = url.match(regex);
        return (match?.length && match[1] ? match[1] : url);
    }

    /**
     * Asks the user for a video URL (and optional language code) and downloads
     * the transcript as a text file.
     * @returns {Promise<File[]|undefined>} A single-element array with the transcript file,
     * or undefined when the popup returns nothing
     * @throws {Error} When the transcript request fails (see getScript)
     */
    async scrape() {
        let lang = '';
        const template = $(await renderExtensionTemplateAsync('attachments', 'youtube-scrape', {}));

        // Attach the handler BEFORE awaiting the popup: once the popup promise has
        // settled the user can no longer type, so a handler added afterwards never fires
        // and the language code would always be empty.
        template.find('input[name="youtubeLanguageCode"]').on('input', function () {
            lang = String($(this).val()).trim();
        });

        const videoUrl = await callGenericPopup(template, POPUP_TYPE.INPUT, '', { wide: false, large: false, okButton: 'Scrape', cancelButton: 'Cancel', rows: 2 });

        if (!videoUrl) {
            return;
        }

        const toast = toastr.info('Working, please wait...');
        let script;

        try {
            script = await this.getScript(String(videoUrl), lang);
        } finally {
            // Clear the progress toast even when the request fails.
            toastr.clear(toast);
        }

        const { transcript, id } = script;
        const file = new File([transcript], `YouTube - ${id} - ${Date.now()}.txt`, { type: 'text/plain' });
        return [file];
    }

    /**
     * Requests the transcript of a video from the server.
     * @param {string} videoUrl Video URL or ID
     * @param {string} lang Language code sent along with the request; may be empty
     * @returns {Promise<{transcript: string, id: string}>} The transcript text and the parsed video ID
     * @throws {Error} When the server responds with a non-OK status (message is the response body)
     */
    async getScript(videoUrl, lang) {
        const id = this.parseId(String(videoUrl).trim());

        const result = await fetch('/api/search/transcript', {
            method: 'POST',
            headers: getRequestHeaders(),
            body: JSON.stringify({ id, lang }),
        });

        if (!result.ok) {
            const error = await result.text();
            throw new Error(error);
        }

        const transcript = await result.text();
        return { transcript, id };
    }
}
|
|
|
/**
 * Registers the built-in scrapers with the ScraperManager, one after another,
 * in the order listed below.
 * @returns {Promise<void>}
 */
export async function initScrapers() {
    const builtInScrapers = [
        FileScraper,
        Notepad,
        WebScraper,
        MediaWikiScraper,
        FandomScraper,
        YouTubeScraper,
    ];

    // Each instance is created right before its registration and the
    // registrations are awaited one at a time, so the order is preserved.
    for (const Scraper of builtInScrapers) {
        await ScraperManager.registerDataBankScraper(new Scraper());
    }
}
|
|