djledda.de main
Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.
 
 

178 řádky
5.3 KiB

  1. import fs from "node:fs/promises";
  2. import path from "node:path";
  3. import articles from "./articles";
  4. import jsdom from "jsdom";
  5. const DUMPS_LOCATION = "article-dumps";
  6. const PROCESSED_LOCATION = "processed-articles";
  7. export async function scrapeArticlesMainContent() {
  8. const promises = articles.map((article) => {
  9. return new Promise<string>(async (resolve, reject) => {
  10. let text;
  11. try {
  12. text = await (await fetch(article.url)).text();
  13. } catch (e) {
  14. console.log("e:", article.url);
  15. reject(`error occurred with this one: ${e}`);
  16. return;
  17. }
  18. resolve(text);
  19. });
  20. });
  21. const results = await Promise.allSettled(promises);
  22. for (let i = 0; i < results.length; i++) {
  23. const result = results[i];
  24. if (result.status === "rejected") {
  25. continue;
  26. }
  27. try {
  28. const dom = new jsdom.JSDOM(result.value);
  29. fs.writeFile(
  30. path.resolve(".", DUMPS_LOCATION, articles[i].slug + ".html"),
  31. dom.window.document.body.getElementsByClassName("entries").item(0)?.innerHTML ?? "",
  32. );
  33. } catch (e) {
  34. console.log("d:", articles[i].url);
  35. }
  36. }
  37. }
  38. function deleteChildCommentsRecursive(node: Node) {
  39. for (const child of node.childNodes) {
  40. // comment node type === 8 === Node.COMMENT_NODE but not available here
  41. if (child.nodeType === 8) {
  42. child.remove();
  43. } else {
  44. deleteChildCommentsRecursive(child);
  45. }
  46. }
  47. }
  48. function setAsTitleIfContainsArticle(doc: Document, node: Node) {
  49. const el = node as HTMLSpanElement;
  50. if (el.innerHTML?.includes("A R T I C L E")) {
  51. el.innerHTML = el.innerHTML.replace("A R T I C L E", "");
  52. el.replaceWith(Object.assign(doc.createElement("h1"), { innerHTML: el.innerHTML }));
  53. let existingTitle = doc.head.querySelector("title");
  54. if (!existingTitle) {
  55. existingTitle = doc.createElement("title");
  56. }
  57. doc.head.appendChild(Object.assign(doc.createElement("title"), { innerHTML: el.innerHTML.trim() }));
  58. return true;
  59. }
  60. return false;
  61. }
  62. type ReplcementEntry = {
  63. tag?: string | null;
  64. attrs?: Partial<Record<string, any>>;
  65. extra?: (doc: Document, node: Node) => boolean;
  66. };
  67. const selectorReplacementMap: Record<string, ReplcementEntry | null> = {
  68. "title": null,
  69. "ul": {
  70. tag: null,
  71. },
  72. "i": {
  73. tag: "em",
  74. },
  75. "font": {
  76. tag: null,
  77. },
  78. "p": {
  79. tag: "p",
  80. },
  81. "div": {
  82. tag: null,
  83. },
  84. "img": null,
  85. "br": null,
  86. "wbr": null,
  87. "b": {
  88. tag: "strong",
  89. extra: setAsTitleIfContainsArticle,
  90. },
  91. "center": null,
  92. "hr": {
  93. tag: "hr",
  94. },
  95. "table": null,
  96. "span.title": {
  97. tag: "header",
  98. attrs: {
  99. className: "title",
  100. },
  101. extra: setAsTitleIfContainsArticle,
  102. },
  103. "span.posted": {
  104. tag: "article",
  105. attrs: {
  106. className: "posted",
  107. },
  108. },
  109. } as const;
  110. function forEachTextNode(doc: Document, root: Node, cb: (doc: Document, node: Text) => void) {
  111. for (const child of root.childNodes) {
  112. // text node type === 3 === Node.TEXT_NODE but not available here
  113. if (child.nodeType === 3) {
  114. cb(doc, child as Text);
  115. } else {
  116. forEachTextNode(doc, child, cb);
  117. }
  118. }
  119. }
  120. async function cleanupFile(fileName: string) {
  121. const filePath = path.resolve(".", DUMPS_LOCATION, fileName);
  122. const { window } = new jsdom.JSDOM((await fs.readFile(filePath)).toString());
  123. const document = window.document;
  124. for (const selector in selectorReplacementMap) {
  125. const replacement = selectorReplacementMap[selector];
  126. for (const node of document.querySelectorAll(selector).values()) {
  127. if (replacement) {
  128. if (replacement.extra?.(document, node)) {
  129. continue;
  130. }
  131. const newNode = replacement.tag
  132. ? document.createElement(replacement.tag)
  133. : document.createDocumentFragment();
  134. newNode.replaceChildren(...node.childNodes);
  135. Object.assign(newNode, { ...replacement.attrs ?? {} });
  136. node.replaceWith(newNode);
  137. } else {
  138. node.remove();
  139. }
  140. }
  141. }
  142. forEachTextNode(document, document.documentElement, (doc: Document, node: Text) => {
  143. if (node.textContent?.match(/\s*=+\s*/)) {
  144. node.replaceWith(doc.createElement("hr"));
  145. return true;
  146. } else if (node.textContent?.includes("REFERENCES")) {
  147. node.replaceWith(Object.assign(doc.createElement("h3"), { innerHTML: node.textContent.trim() }));
  148. return true;
  149. }
  150. return false;
  151. });
  152. deleteChildCommentsRecursive(document.documentElement);
  153. fs.writeFile(
  154. path.resolve(".", PROCESSED_LOCATION, fileName),
  155. document.documentElement.outerHTML.replaceAll(/�/g, '"'),
  156. );
  157. }
  158. async function cleanup() {
  159. const promises: Promise<unknown>[] = [];
  160. for (const fileName of await fs.readdir(path.resolve(".", "article-dumps"))) {
  161. promises.push(cleanupFile(fileName));
  162. }
  163. await Promise.allSettled(promises);
  164. }
  165. cleanup();