import fs from "node:fs/promises";
import path from "node:path";
import articles from "./articles";
import jsdom from "jsdom";

/** Directory (relative to the working directory) holding the raw scraped article bodies. */
const DUMPS_LOCATION = "article-dumps";
/** Directory (relative to the working directory) receiving the cleaned-up articles. */
const PROCESSED_LOCATION = "processed-articles";

// Numeric DOM nodeType values; Node.TEXT_NODE / Node.COMMENT_NODE are not available here.
const TEXT_NODE_TYPE = 3;
const COMMENT_NODE_TYPE = 8;

/** Marker text that identifies the element carrying an article's title. */
const ARTICLE_MARKER = "A R T I C L E";

/**
 * Fetches every article listed in `./articles`, extracts the inner HTML of the
 * first element with class `entries`, and writes it to
 * `<DUMPS_LOCATION>/<slug>.html`.
 *
 * All articles are processed concurrently. A failure for one article (fetch,
 * parse or write) is logged and does not affect the others. An article without
 * an `entries` element produces an empty file.
 *
 * @returns A promise that resolves once every article has been written or logged as failed.
 * @throws If the dump directory cannot be created.
 */
export async function scrapeArticlesMainContent() {
  const dumpsDir = path.resolve(".", DUMPS_LOCATION);
  await fs.mkdir(dumpsDir, { recursive: true });

  await Promise.all(
    articles.map(async (article) => {
      let html: string;
      try {
        html = await (await fetch(article.url)).text();
      } catch (e: unknown) {
        console.log("e:", article.url, e);
        return;
      }

      try {
        const dom = new jsdom.JSDOM(html);
        const entries = dom.window.document.body.getElementsByClassName("entries").item(0);
        // Awaited so that a failed write is caught below instead of becoming an unhandled rejection.
        await fs.writeFile(path.resolve(dumpsDir, article.slug + ".html"), entries?.innerHTML ?? "");
      } catch (e: unknown) {
        console.log("d:", article.url, e);
      }
    }),
  );
}

/**
 * Removes every comment node found anywhere below `node`.
 *
 * @param node Root of the subtree to strip; the root itself is never removed.
 */
function deleteChildCommentsRecursive(node: Node) {
  // Iterate over a snapshot: `childNodes` is live, so removing a child while
  // walking it directly would skip the sibling that follows each removed comment.
  for (const child of Array.from(node.childNodes)) {
    if (child.nodeType === COMMENT_NODE_TYPE) {
      child.remove();
    } else {
      deleteChildCommentsRecursive(child);
    }
  }
}

/**
 * If `node` contains the article marker, strips the first occurrence of the
 * marker, replaces the node with an `<h1>` holding the remaining markup, and
 * stores the trimmed markup as the document title.
 *
 * @param doc Document that owns `node`.
 * @param node Candidate element.
 * @returns `true` when the node was converted (and must not be processed further), otherwise `false`.
 */
function setAsTitleIfContainsArticle(doc: Document, node: Node) {
  const el = node as HTMLSpanElement;
  if (!el.innerHTML?.includes(ARTICLE_MARKER)) {
    return false;
  }

  // Round-trip through innerHTML so the heading uses the serialised form of the stripped markup.
  el.innerHTML = el.innerHTML.replace(ARTICLE_MARKER, "");
  const headingHtml = el.innerHTML;
  el.replaceWith(Object.assign(doc.createElement("h1"), { innerHTML: headingHtml }));

  // Reuse an existing <title> rather than appending a second one.
  let title = doc.head.querySelector("title");
  if (!title) {
    title = doc.createElement("title");
    doc.head.appendChild(title);
  }
  title.innerHTML = headingHtml.trim();
  return true;
}

/** Describes how elements matching a selector are rewritten during cleanup. */
type ReplacementEntry = {
  /** Tag of the replacement element; `null`/omitted unwraps the element, keeping only its children. */
  tag?: string | null;
  /** Properties assigned onto the replacement node. */
  attrs?: Partial<Record<string, string>>;
  /** Optional hook; when it returns `true` the default replacement is skipped. */
  extra?: (doc: Document, node: Node) => boolean;
};

/**
 * Maps a CSS selector to its replacement rule. A `null` rule removes matching
 * elements together with their content. Selectors are applied in insertion order.
 */
const selectorReplacementMap: Record<string, ReplacementEntry | null> = {
  "title": null,
  "ul": {
    tag: null,
  },
  "i": {
    tag: "em",
  },
  "font": {
    tag: null,
  },
  "p": {
    tag: "p",
  },
  "div": {
    tag: null,
  },
  "img": null,
  "br": null,
  "wbr": null,
  "b": {
    tag: "strong",
    extra: setAsTitleIfContainsArticle,
  },
  "center": null,
  "hr": {
    tag: "hr",
  },
  "table": null,
  "span.title": {
    tag: "header",
    attrs: {
      className: "title",
    },
    extra: setAsTitleIfContainsArticle,
  },
  "span.posted": {
    tag: "article",
    attrs: {
      className: "posted",
    },
  },
} as const;

/**
 * Invokes `cb` for every text node below `root`, depth first.
 *
 * @param doc Document passed through to the callback.
 * @param root Subtree root; the root itself is not visited.
 * @param cb Callback run for each text node; it may replace the node it receives.
 */
function forEachTextNode(doc: Document, root: Node, cb: (doc: Document, node: Text) => void) {
  // Snapshot the live child list so replacements made by the callback cannot disturb the walk.
  for (const child of Array.from(root.childNodes)) {
    if (child.nodeType === TEXT_NODE_TYPE) {
      cb(doc, child as Text);
    } else {
      forEachTextNode(doc, child, cb);
    }
  }
}

/**
 * Reads one dumped article, normalises its markup and writes the result to
 * `<PROCESSED_LOCATION>/<fileName>`.
 *
 * Steps: apply `selectorReplacementMap`, turn separator-only text nodes into
 * `<hr>`, turn text nodes containing "REFERENCES" into `<h3>`, drop comments.
 *
 * @param fileName Name of the file inside the dump directory.
 * @throws If the file cannot be read or the result cannot be written.
 */
async function cleanupFile(fileName: string) {
  const filePath = path.resolve(".", DUMPS_LOCATION, fileName);
  const { window } = new jsdom.JSDOM(await fs.readFile(filePath, "utf8"));
  const document = window.document;

  for (const [selector, replacement] of Object.entries(selectorReplacementMap)) {
    for (const node of Array.from(document.querySelectorAll(selector))) {
      if (!replacement) {
        node.remove();
        continue;
      }
      if (replacement.extra?.(document, node)) {
        continue;
      }
      // No tag means "unwrap": a fragment dissolves into its children on insertion.
      const newNode = replacement.tag
        ? document.createElement(replacement.tag)
        : document.createDocumentFragment();
      newNode.replaceChildren(...Array.from(node.childNodes));
      Object.assign(newNode, replacement.attrs ?? {});
      node.replaceWith(newNode);
    }
  }

  forEachTextNode(document, document.documentElement, (doc: Document, node: Text) => {
    const text = node.data;
    // Anchored: only a node consisting solely of '=' characters (plus whitespace) is a separator.
    // An unanchored pattern would discard any text that merely contains '='.
    if (/^\s*=+\s*$/.test(text)) {
      node.replaceWith(doc.createElement("hr"));
    } else if (text.includes("REFERENCES")) {
      // textContent keeps the heading as plain text instead of re-parsing it as markup.
      node.replaceWith(Object.assign(doc.createElement("h3"), { textContent: text.trim() }));
    }
  });

  deleteChildCommentsRecursive(document.documentElement);

  const outPath = path.resolve(".", PROCESSED_LOCATION, fileName);
  await fs.mkdir(path.dirname(outPath), { recursive: true });
  // Awaited so callers observe write failures.
  await fs.writeFile(outPath, document.documentElement.outerHTML.replaceAll(/�/g, '"'));
}

/**
 * Runs `cleanupFile` concurrently for every file in the dump directory and
 * logs each file that failed.
 *
 * @throws If the dump directory cannot be listed.
 */
async function cleanup() {
  const fileNames = await fs.readdir(path.resolve(".", DUMPS_LOCATION));
  const results = await Promise.allSettled(fileNames.map((fileName) => cleanupFile(fileName)));
  results.forEach((result, index) => {
    if (result.status === "rejected") {
      console.error("cleanup failed:", fileNames[index], result.reason);
    }
  });
}

cleanup().catch((e: unknown) => {
  console.error(e);
  process.exitCode = 1;
});