diff --git a/lib/api/archive.ts b/lib/api/archive.ts index e92873f..b081564 100644 --- a/lib/api/archive.ts +++ b/lib/api/archive.ts @@ -2,6 +2,9 @@ import { chromium, devices } from "playwright"; import { prisma } from "@/lib/api/db"; import createFile from "@/lib/api/storage/createFile"; import sendToWayback from "./sendToWayback"; +import { Readability } from "@mozilla/readability"; +import { JSDOM } from "jsdom"; +import DOMPurify from "dompurify"; export default async function archive( linkId: number, @@ -30,13 +33,14 @@ export default async function archive( // if (checkExistingLink) return "A request has already been made."; - const link = await prisma.link.update({ + const targetLink = await prisma.link.update({ where: { id: linkId, }, data: { screenshotPath: user?.archiveAsScreenshot ? "pending" : null, pdfPath: user?.archiveAsPDF ? "pending" : null, + readabilityPath: "pending", }, }); @@ -50,10 +54,43 @@ export default async function archive( try { await page.goto(url, { waitUntil: "domcontentloaded" }); - await page.evaluate( - autoScroll, - Number(process.env.AUTOSCROLL_TIMEOUT) || 30 - ); + await page.goto(url); + const content = await page.content(); + + // Readability + + const window = new JSDOM("").window; + const purify = DOMPurify(window); + const cleanedUpContent = purify.sanitize(content); + + const dom = new JSDOM(cleanedUpContent, { + url: url, + }); + + const article = new Readability(dom.window.document).parse(); + + await prisma.link.update({ + where: { + id: linkId, + }, + data: { + readabilityPath: `archives/${targetLink.collectionId}/${linkId}_readability.txt`, + }, + }); + + await createFile({ + data: JSON.stringify(article), + filePath: `archives/${targetLink.collectionId}/${linkId}_readability.txt`, + }); + + console.log(JSON.parse(JSON.stringify(article))); + + // Screenshot/PDF + + let faulty = true; + await page + .evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30) + .catch((e) => (faulty = false)); const linkExists = await prisma.link.findUnique({ where: { @@ -61,7 +98,7 @@ export default async function archive( }, }); - if (linkExists) { + if (linkExists && faulty) { if (user.archiveAsScreenshot) { const screenshot = await page.screenshot({ fullPage: true, @@ -100,10 +137,21 @@ export default async function archive( : null, }, }); + } else if (faulty) { + await prisma.link.update({ + where: { + id: linkId, + }, + data: { + screenshotPath: null, + pdfPath: null, + }, + }); } await browser.close(); } catch (err) { + console.log(err); await browser.close(); return err; } diff --git a/lib/api/controllers/links/linkId/deleteLinkById.ts b/lib/api/controllers/links/linkId/deleteLinkById.ts index b66b850..3273a5c 100644 --- a/lib/api/controllers/links/linkId/deleteLinkById.ts +++ b/lib/api/controllers/links/linkId/deleteLinkById.ts @@ -31,6 +31,9 @@ export default async function deleteLink(userId: number, linkId: number) { removeFile({ filePath: `archives/${collectionIsAccessible?.id}/${linkId}.png`, }); + removeFile({ + filePath: `archives/${collectionIsAccessible?.id}/${linkId}_readability.txt`, + }); return { response: deleteLink, status: 200 }; } diff --git a/lib/api/controllers/links/linkId/updateLinkById.ts b/lib/api/controllers/links/linkId/updateLinkById.ts index 588978f..0f744b4 100644 --- a/lib/api/controllers/links/linkId/updateLinkById.ts +++ b/lib/api/controllers/links/linkId/updateLinkById.ts @@ -102,6 +102,11 @@ export default async function updateLinkById( `archives/${collectionIsAccessible?.id}/${linkId}.png`, `archives/${data.collection.id}/${linkId}.png` ); + + await moveFile( + `archives/${collectionIsAccessible?.id}/${linkId}_readability.txt`, + `archives/${data.collection.id}/${linkId}_readability.txt` + ); } return { response: updatedLink, status: 200 }; diff --git a/lib/api/storage/readFile.ts b/lib/api/storage/readFile.ts index bf543cf..59e1bef 100644 --- a/lib/api/storage/readFile.ts +++ b/lib/api/storage/readFile.ts @@ -58,6 +58,8 @@ export default async function readFile(filePath: string) { contentType = "application/pdf"; } else if (filePath.endsWith(".png")) { contentType = "image/png"; + } else if (filePath.endsWith("_readability.txt")) { + contentType = "text/plain"; } else { // if (filePath.endsWith(".jpg")) contentType = "image/jpeg"; @@ -83,6 +85,8 @@ export default async function readFile(filePath: string) { contentType = "application/pdf"; } else if (filePath.endsWith(".png")) { contentType = "image/png"; + } else if (filePath.endsWith("_readability.txt")) { + contentType = "text/plain"; } else { // if (filePath.endsWith(".jpg")) contentType = "image/jpeg"; diff --git a/package.json b/package.json index c84eddc..39ae11f 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,7 @@ "@fortawesome/free-solid-svg-icons": "^6.4.0", "@fortawesome/react-fontawesome": "^0.2.0", "@headlessui/react": "^1.7.15", + "@mozilla/readability": "^0.4.4", "@next/font": "13.4.9", "@prisma/client": "^4.16.2", "@stripe/stripe-js": "^1.54.1", @@ -34,6 +35,7 @@ "colorthief": "^2.4.0", "crypto-js": "^4.2.0", "csstype": "^3.1.2", + "dompurify": "^3.0.6", "eslint": "8.46.0", "eslint-config-next": "13.4.9", "framer-motion": "^10.16.4", @@ -57,6 +59,7 @@ "devDependencies": { "@playwright/test": "^1.35.1", "@types/bcrypt": "^5.0.0", + "@types/dompurify": "^3.0.4", "@types/jsdom": "^21.1.3", "autoprefixer": "^10.4.14", "postcss": "^8.4.26", diff --git a/pages/collections/[id].tsx b/pages/collections/[id].tsx index 8967cf1..77b0334 100644 --- a/pages/collections/[id].tsx +++ b/pages/collections/[id].tsx @@ -229,9 +229,11 @@ export default function Index() { {links.some((e) => e.collectionId === Number(router.query.id)) ? (