|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177 |
- import fs from "node:fs/promises";
- import path from "node:path";
- import articles from "./articles";
- import jsdom from "jsdom";
-
- const DUMPS_LOCATION = "article-dumps"; // directory (resolved against the working directory) that holds the raw scraped HTML dumps
- const PROCESSED_LOCATION = "processed-articles"; // directory (resolved against the working directory) that receives the cleaned-up HTML
-
/**
 * Downloads every entry of `articles` and stores the main content of each page.
 *
 * For each article the page at `article.url` is fetched and parsed with jsdom;
 * the inner HTML of the first element carrying the class `entries` (or an
 * empty string when there is none) is written to
 * `<cwd>/article-dumps/<slug>.html`.
 *
 * Failures are best-effort: a failed download is logged with the prefix `e:`
 * and a failed parse/write with the prefix `d:`; neither aborts the other
 * articles. The returned promise resolves only after every write has finished.
 *
 * @returns a promise that resolves once all articles have been processed.
 * @throws if the output directory cannot be created.
 */
export async function scrapeArticlesMainContent(): Promise<void> {
  const outputDir = path.resolve(".", DUMPS_LOCATION);
  // Make sure the target directory exists so the writes below cannot fail
  // merely because this is the first run.
  await fs.mkdir(outputDir, { recursive: true });

  await Promise.all(
    articles.map(async (article) => {
      let html: string;
      try {
        const response = await fetch(article.url);
        html = await response.text();
      } catch (error: unknown) {
        console.log("e:", article.url, error);
        return;
      }

      try {
        const dom = new jsdom.JSDOM(html);
        const mainContent =
          dom.window.document.body.getElementsByClassName("entries").item(0)?.innerHTML ?? "";
        // Awaited so that write errors land in the catch below instead of
        // surfacing as unhandled rejections.
        await fs.writeFile(path.resolve(outputDir, article.slug + ".html"), mainContent);
      } catch (error: unknown) {
        console.log("d:", article.url, error);
      }
    }),
  );
}
-
/**
 * Removes every comment node found anywhere below `node`.
 *
 * @param node root of the subtree to clean; the root itself is never removed.
 */
function deleteChildCommentsRecursive(node: Node): void {
  // Numeric value of Node.COMMENT_NODE; the named constant is not available here.
  const COMMENT_NODE = 8;

  // Iterate over a snapshot: `childNodes` is live, so removing a child while
  // walking it directly would shift the remaining children and skip the
  // sibling that follows every removed comment.
  for (const child of Array.from(node.childNodes)) {
    if (child.nodeType === COMMENT_NODE) {
      child.remove();
    } else {
      deleteChildCommentsRecursive(child);
    }
  }
}
-
/**
 * Promotes an element whose markup contains the marker `A R T I C L E` to the
 * document heading and title.
 *
 * When the marker is present, its first occurrence is stripped, the element
 * is replaced by an `<h1>` holding the remaining markup, and the document's
 * `<title>` is set to that markup (trimmed). An existing `<title>` in
 * `<head>` is reused; one is created and appended only when none exists, so
 * repeated matches never leave several `<title>` elements behind.
 *
 * @param doc document that owns `node`; used to create elements and to reach `<head>`.
 * @param node candidate node; anything without an `innerHTML` string is ignored.
 * @returns `true` when the node was replaced (callers should not process it
 *          further), `false` when the marker was not found.
 */
function setAsTitleIfContainsArticle(doc: Document, node: Node): boolean {
  const ARTICLE_MARKER = "A R T I C L E";
  const el = node as HTMLSpanElement;
  // `?.` guards against nodes that are not elements and so lack `innerHTML`.
  if (!el.innerHTML?.includes(ARTICLE_MARKER)) {
    return false;
  }

  const strippedHtml = el.innerHTML.replace(ARTICLE_MARKER, "");
  el.replaceWith(Object.assign(doc.createElement("h1"), { innerHTML: strippedHtml }));

  let title = doc.head.querySelector("title");
  if (!title) {
    title = doc.createElement("title");
    doc.head.appendChild(title);
  }
  title.innerHTML = strippedHtml.trim();
  return true;
}
-
/**
 * Describes what to do with the elements matched by one selector.
 *
 * The identifier keeps its existing spelling because other declarations in
 * this file refer to it by that name.
 */
type ReplcementEntry = {
  /**
   * Tag name of the element that replaces the match. When absent or `null`
   * the match is unwrapped: only its children are kept.
   */
  tag?: string | null;
  /** Properties copied onto the replacement node (e.g. `className`). */
  attrs?: Readonly<Record<string, unknown>>;
  /**
   * Optional custom handler run before the default replacement. Returning
   * `true` means the node was already handled and must be left alone.
   */
  extra?: (doc: Document, node: Node) => boolean;
};
-
/**
 * Clean-up rules keyed by CSS selector.
 *
 * - `null`: every matching element is removed together with its content.
 * - entry with `tag: null`: the element is unwrapped (its children stay).
 * - entry with a `tag`: the element is replaced by a new element of that tag,
 *   receiving the properties listed in `attrs`.
 * - `extra`: custom handler that may take over the replacement entirely.
 *
 * Rules are applied in declaration order, so the order of the keys matters
 * (for instance existing `title` elements are dropped before any rule that
 * may create a new title runs).
 *
 * The explicit annotation already fixes the type of this constant, so a
 * trailing `as const` would neither narrow it nor make it read-only;
 * `Readonly` is used to express the intended immutability instead.
 */
const selectorReplacementMap: Readonly<Record<string, ReplcementEntry | null>> = {
  // Removed outright.
  "title": null,
  // Layout-only wrappers: keep the children, drop the wrapper.
  "ul": {
    tag: null,
  },
  "i": {
    tag: "em",
  },
  "font": {
    tag: null,
  },
  // Re-created as a bare <p>, which discards the original attributes.
  "p": {
    tag: "p",
  },
  "div": {
    tag: null,
  },
  "img": null,
  "br": null,
  "wbr": null,
  // Bold text becomes <strong>, unless it carries the article marker, in
  // which case the handler turns it into the heading/title instead.
  "b": {
    tag: "strong",
    extra: setAsTitleIfContainsArticle,
  },
  "center": null,
  // Re-created as a bare <hr>, which discards the original attributes.
  "hr": {
    tag: "hr",
  },
  "table": null,
  "span.title": {
    tag: "header",
    attrs: {
      className: "title",
    },
    extra: setAsTitleIfContainsArticle,
  },
  "span.posted": {
    tag: "article",
    attrs: {
      className: "posted",
    },
  },
};
-
/**
 * Invokes `cb` for every text node below `root`, in document order.
 *
 * The callback may remove or replace the node it receives: children are
 * visited from a snapshot, so such changes cannot cause the following sibling
 * to be skipped (which would happen when walking the live `childNodes` list).
 *
 * @param doc document handed through to `cb` unchanged.
 * @param root node whose descendants are searched; `root` itself is not reported.
 * @param cb called with `doc` and each text node found.
 */
function forEachTextNode(doc: Document, root: Node, cb: (doc: Document, node: Text) => void): void {
  // Numeric value of Node.TEXT_NODE; the named constant is not available here.
  const TEXT_NODE = 3;

  for (const child of Array.from(root.childNodes)) {
    if (child.nodeType === TEXT_NODE) {
      cb(doc, child as Text);
    } else {
      forEachTextNode(doc, child, cb);
    }
  }
}
-
/**
 * Cleans one scraped HTML dump and writes the result to the processed folder.
 *
 * Steps:
 *  1. parse `<cwd>/article-dumps/<fileName>` with jsdom;
 *  2. apply every rule of `selectorReplacementMap` in declaration order
 *     (remove, unwrap or replace the matching elements);
 *  3. turn text nodes that consist solely of `=` characters (plus optional
 *     surrounding whitespace) into `<hr>` and text nodes containing
 *     `REFERENCES` into an `<h3>`;
 *  4. strip all comment nodes;
 *  5. write the serialized document to `<cwd>/processed-articles/<fileName>`,
 *     replacing every U+FFFD replacement character with a double quote.
 *
 * @param fileName name of the dump file, relative to the dumps directory.
 * @returns a promise that resolves once the processed file has been written.
 * @throws if the dump cannot be read or the result cannot be written.
 */
async function cleanupFile(fileName: string): Promise<void> {
  const sourcePath = path.resolve(".", DUMPS_LOCATION, fileName);
  const { window } = new jsdom.JSDOM(await fs.readFile(sourcePath, "utf8"));
  const document = window.document;

  for (const [selector, replacement] of Object.entries(selectorReplacementMap)) {
    for (const node of document.querySelectorAll(selector).values()) {
      if (!replacement) {
        node.remove();
        continue;
      }
      // A custom handler that reports `true` has already dealt with the node.
      if (replacement.extra?.(document, node)) {
        continue;
      }
      // Without a tag the children are moved into a fragment, which
      // effectively unwraps the element when it is swapped in below.
      const newNode = replacement.tag
        ? document.createElement(replacement.tag)
        : document.createDocumentFragment();
      newNode.replaceChildren(...node.childNodes);
      Object.assign(newNode, replacement.attrs ?? {});
      node.replaceWith(newNode);
    }
  }

  // Anchored on both ends so that only pure separator runs ("=====") become a
  // rule; ordinary text that merely contains "=" is left intact.
  const separatorPattern = /^\s*=+\s*$/;
  forEachTextNode(document, document.documentElement, (doc: Document, node: Text) => {
    const text = node.textContent ?? "";
    if (separatorPattern.test(text)) {
      node.replaceWith(doc.createElement("hr"));
    } else if (text.includes("REFERENCES")) {
      // textContent (not innerHTML) so that "<" or "&" in the text are not
      // re-interpreted as markup.
      node.replaceWith(Object.assign(doc.createElement("h3"), { textContent: text.trim() }));
    }
  });

  deleteChildCommentsRecursive(document.documentElement);

  const outputDir = path.resolve(".", PROCESSED_LOCATION);
  await fs.mkdir(outputDir, { recursive: true });
  // Awaited so that a failed write rejects this function's promise instead of
  // becoming an unhandled rejection.
  await fs.writeFile(
    path.resolve(outputDir, fileName),
    document.documentElement.outerHTML.replace(/\uFFFD/g, '"'),
  );
}
-
/**
 * Runs `cleanupFile` on every file found in the dumps directory.
 *
 * All files are processed concurrently; a failure for one file does not stop
 * the others, but each failure is reported on stderr.
 *
 * @returns a promise that resolves once every file has been attempted.
 * @throws if the dumps directory cannot be listed.
 */
async function cleanup(): Promise<void> {
  // Same directory that cleanupFile reads from; use the shared constant
  // rather than repeating the literal.
  const fileNames = await fs.readdir(path.resolve(".", DUMPS_LOCATION));
  const results = await Promise.allSettled(fileNames.map((fileName) => cleanupFile(fileName)));

  results.forEach((result, index) => {
    if (result.status === "rejected") {
      console.error(`failed to clean up ${fileNames[index]}:`, result.reason);
    }
  });
}
-
// Kick off the clean-up pass when the module is loaded. The promise is
// handled explicitly so that a failure (for example a missing dumps
// directory) is reported with context instead of surfacing as an unhandled
// rejection.
cleanup().catch((error: unknown) => {
  console.error("article clean-up failed:", error);
  process.exitCode = 1;
});
|