diff --git a/lib/api/archive.ts b/lib/api/archive.ts index e92873f..b081564 100644 --- a/lib/api/archive.ts +++ b/lib/api/archive.ts @@ -2,6 +2,9 @@ import { chromium, devices } from "playwright"; import { prisma } from "@/lib/api/db"; import createFile from "@/lib/api/storage/createFile"; import sendToWayback from "./sendToWayback"; +import { Readability } from "@mozilla/readability"; +import { JSDOM } from "jsdom"; +import DOMPurify from "dompurify"; export default async function archive( linkId: number, @@ -30,13 +33,14 @@ export default async function archive( // if (checkExistingLink) return "A request has already been made."; - const link = await prisma.link.update({ + const targetLink = await prisma.link.update({ where: { id: linkId, }, data: { screenshotPath: user?.archiveAsScreenshot ? "pending" : null, pdfPath: user?.archiveAsPDF ? "pending" : null, + readabilityPath: "pending", }, }); @@ -50,10 +54,43 @@ export default async function archive( try { await page.goto(url, { waitUntil: "domcontentloaded" }); - await page.evaluate( - autoScroll, - Number(process.env.AUTOSCROLL_TIMEOUT) || 30 - ); + await page.goto(url); + const content = await page.content(); + + // Readability + + const window = new JSDOM("").window; + const purify = DOMPurify(window); + const cleanedUpContent = purify.sanitize(content); + + const dom = new JSDOM(cleanedUpContent, { + url: url, + }); + + const article = new Readability(dom.window.document).parse(); + + await prisma.link.update({ + where: { + id: linkId, + }, + data: { + readabilityPath: `archives/${targetLink.collectionId}/${linkId}_readability.txt`, + }, + }); + + await createFile({ + data: JSON.stringify(article), + filePath: `archives/${targetLink.collectionId}/${linkId}_readability.txt`, + }); + + console.log(JSON.parse(JSON.stringify(article))); + + // Screenshot/PDF + + let faulty = true; + await page + .evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30) + .catch((e) => (faulty = false)); const linkExists = await prisma.link.findUnique({ where: { @@ -61,7 +98,7 @@ export default async function archive( }, }); - if (linkExists) { + if (linkExists && faulty) { if (user.archiveAsScreenshot) { const screenshot = await page.screenshot({ fullPage: true, @@ -100,10 +137,21 @@ export default async function archive( : null, }, }); + } else if (faulty) { + await prisma.link.update({ + where: { + id: linkId, + }, + data: { + screenshotPath: null, + pdfPath: null, + }, + }); } await browser.close(); } catch (err) { + console.log(err); await browser.close(); return err; } diff --git a/lib/api/controllers/links/linkId/deleteLinkById.ts b/lib/api/controllers/links/linkId/deleteLinkById.ts index b66b850..3273a5c 100644 --- a/lib/api/controllers/links/linkId/deleteLinkById.ts +++ b/lib/api/controllers/links/linkId/deleteLinkById.ts @@ -31,6 +31,9 @@ export default async function deleteLink(userId: number, linkId: number) { removeFile({ filePath: `archives/${collectionIsAccessible?.id}/${linkId}.png`, }); + removeFile({ + filePath: `archives/${collectionIsAccessible?.id}/${linkId}_readability.txt`, + }); return { response: deleteLink, status: 200 }; } diff --git a/lib/api/controllers/links/linkId/updateLinkById.ts b/lib/api/controllers/links/linkId/updateLinkById.ts index 588978f..0f744b4 100644 --- a/lib/api/controllers/links/linkId/updateLinkById.ts +++ b/lib/api/controllers/links/linkId/updateLinkById.ts @@ -102,6 +102,11 @@ export default async function updateLinkById( `archives/${collectionIsAccessible?.id}/${linkId}.png`, `archives/${data.collection.id}/${linkId}.png` ); + + await moveFile( + `archives/${collectionIsAccessible?.id}/${linkId}_readability.txt`, + `archives/${data.collection.id}/${linkId}_readability.txt` + ); } return { response: updatedLink, status: 200 }; diff --git a/lib/api/storage/readFile.ts b/lib/api/storage/readFile.ts index bf543cf..59e1bef 100644 --- a/lib/api/storage/readFile.ts +++ b/lib/api/storage/readFile.ts @@ -58,6 +58,8 @@ export default async function readFile(filePath: string) { contentType = "application/pdf"; } else if (filePath.endsWith(".png")) { contentType = "image/png"; + } else if (filePath.endsWith("_readability.txt")) { + contentType = "text/plain"; } else { // if (filePath.endsWith(".jpg")) contentType = "image/jpeg"; @@ -83,6 +85,8 @@ export default async function readFile(filePath: string) { contentType = "application/pdf"; } else if (filePath.endsWith(".png")) { contentType = "image/png"; + } else if (filePath.endsWith("_readability.txt")) { + contentType = "text/plain"; } else { // if (filePath.endsWith(".jpg")) contentType = "image/jpeg"; diff --git a/package.json b/package.json index c84eddc..39ae11f 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,7 @@ "@fortawesome/free-solid-svg-icons": "^6.4.0", "@fortawesome/react-fontawesome": "^0.2.0", "@headlessui/react": "^1.7.15", + "@mozilla/readability": "^0.4.4", "@next/font": "13.4.9", "@prisma/client": "^4.16.2", "@stripe/stripe-js": "^1.54.1", @@ -34,6 +35,7 @@ "colorthief": "^2.4.0", "crypto-js": "^4.2.0", "csstype": "^3.1.2", + "dompurify": "^3.0.6", "eslint": "8.46.0", "eslint-config-next": "13.4.9", "framer-motion": "^10.16.4", @@ -57,6 +59,7 @@ "devDependencies": { "@playwright/test": "^1.35.1", "@types/bcrypt": "^5.0.0", + "@types/dompurify": "^3.0.4", "@types/jsdom": "^21.1.3", "autoprefixer": "^10.4.14", "postcss": "^8.4.26", diff --git a/pages/collections/[id].tsx b/pages/collections/[id].tsx index 8967cf1..77b0334 100644 --- a/pages/collections/[id].tsx +++ b/pages/collections/[id].tsx @@ -229,9 +229,11 @@ export default function Index() { {links.some((e) => e.collectionId === Number(router.query.id)) ? (
- {links.map((e, i) => { - return ; - })} + {links + .filter((e) => e.collection.id === activeCollection?.id) + .map((e, i) => { + return ; + })}
) : ( diff --git a/pages/tags/[id].tsx b/pages/tags/[id].tsx index b8435f5..549ef5f 100644 --- a/pages/tags/[id].tsx +++ b/pages/tags/[id].tsx @@ -191,9 +191,11 @@ export default function Index() {
- {links.map((e, i) => { - return ; - })} + {links + .filter((e) => e.tags.some((e) => e.id === Number(router.query.id))) + .map((e, i) => { + return ; + })}
diff --git a/prisma/migrations/20231029183108_added_readability_path_field_to_the_link_table/migration.sql b/prisma/migrations/20231029183108_added_readability_path_field_to_the_link_table/migration.sql new file mode 100644 index 0000000..bbf2423 --- /dev/null +++ b/prisma/migrations/20231029183108_added_readability_path_field_to_the_link_table/migration.sql @@ -0,0 +1,2 @@ +-- AlterTable +ALTER TABLE "Link" ADD COLUMN "readabilityPath" TEXT; diff --git a/prisma/schema.prisma b/prisma/schema.prisma index d89d2a6..638a12f 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -92,22 +92,23 @@ model UsersAndCollections { } model Link { - id Int @id @default(autoincrement()) - name String - url String - description String @default("") - - pinnedBy User[] - - collection Collection @relation(fields: [collectionId], references: [id]) - collectionId Int - tags Tag[] - - screenshotPath String? - pdfPath String? - - createdAt DateTime @default(now()) - updatedAt DateTime @updatedAt @default(now()) + id Int @id @default(autoincrement()) + name String + url String + description String @default("") + + pinnedBy User[] + + collection Collection @relation(fields: [collectionId], references: [id]) + collectionId Int + tags Tag[] + + screenshotPath String? + pdfPath String? + readabilityPath String? + + createdAt DateTime @default(now()) + updatedAt DateTime @updatedAt @default(now()) } model Tag { diff --git a/yarn.lock b/yarn.lock index ddbf62f..bd41d88 100644 --- a/yarn.lock +++ b/yarn.lock @@ -902,6 +902,11 @@ semver "^7.3.5" tar "^6.1.11" +"@mozilla/readability@^0.4.4": + version "0.4.4" + resolved "https://registry.yarnpkg.com/@mozilla/readability/-/readability-0.4.4.tgz#5d258b3436eba1d5fe31c4386e50a848d5cb5811" + integrity sha512-MCgZyANpJ6msfvVMi6+A0UAsvZj//4OHREYUB9f2087uXHVoU+H+SWhuihvb1beKpM323bReQPRio0WNk2+V6g== + "@next/env@13.4.12": version "13.4.12" resolved "https://registry.yarnpkg.com/@next/env/-/env-13.4.12.tgz#0b88115ab817f178bf9dc0c5e7b367277595b58d" @@ -1490,6 +1495,13 @@ resolved "https://registry.yarnpkg.com/@types/crypto-js/-/crypto-js-4.1.1.tgz#602859584cecc91894eb23a4892f38cfa927890d" integrity sha512-BG7fQKZ689HIoc5h+6D2Dgq1fABRa0RbBWKBd9SP/MVRVXROflpm5fhwyATX5duFmbStzyzyycPB8qUYKDH3NA== +"@types/dompurify@^3.0.4": + version "3.0.4" + resolved "https://registry.yarnpkg.com/@types/dompurify/-/dompurify-3.0.4.tgz#8a6369dec2dd0c397d01751adf3364be035b40d8" + integrity sha512-1Jk8S/IRzNSbwQRbuGuLFHviwxQ8pX81ZEW3INY9432Cwb4VedkBYan8gSIXVLOLHBtimOmUTEYphjRVmo+30g== + dependencies: + "@types/trusted-types" "*" + "@types/jsdom@^21.1.3": version "21.1.3" resolved "https://registry.yarnpkg.com/@types/jsdom/-/jsdom-21.1.3.tgz#a88c5dc65703e1b10b2a7839c12db49662b43ff0" @@ -1559,6 +1571,11 @@ resolved "https://registry.yarnpkg.com/@types/tough-cookie/-/tough-cookie-4.0.3.tgz#3d06b6769518450871fbc40770b7586334bdfd90" integrity sha512-THo502dA5PzG/sfQH+42Lw3fvmYkceefOspdCwpHRul8ik2Jv1K8I5OZz1AT3/rs46kwgMCe9bSBmDLYkkOMGg== +"@types/trusted-types@*": + version "2.0.5" + resolved "https://registry.yarnpkg.com/@types/trusted-types/-/trusted-types-2.0.5.tgz#5cac7e7df3275bb95f79594f192d97da3b4fd5fe" + integrity sha512-I3pkr8j/6tmQtKV/ZzHtuaqYSQvyjGRKH4go60Rr0IDLlFxuRT5V32uvB1mecM5G1EVAUyF/4r4QZ1GHgz+mxA== + "@typescript-eslint/parser@^5.42.0": version "5.49.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/parser/-/parser-5.49.0.tgz#d699734b2f20e16351e117417d34a2bc9d7c4b90" @@ -2290,6 +2307,11 @@ domexception@^4.0.0: dependencies: webidl-conversions "^7.0.0" +dompurify@^3.0.6: + version "3.0.6" + resolved "https://registry.yarnpkg.com/dompurify/-/dompurify-3.0.6.tgz#925ebd576d54a9531b5d76f0a5bef32548351dae" + integrity sha512-ilkD8YEnnGh1zJ240uJsW7AzE+2qpbOUYjacomn3AvJ6J4JhKGSZ2nh4wUIXPZrEPppaCLx5jFe8T89Rk8tQ7w== + ecc-jsbn@~0.1.1: version "0.1.2" resolved "https://registry.yarnpkg.com/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz#3a83a904e54353287874c564b7549386849a98c9"