From 3618ba907d670fe23afc01ce93bc7e32928e4da4 Mon Sep 17 00:00:00 2001 From: daniel31x13 Date: Wed, 26 Jun 2024 13:54:03 -0400 Subject: [PATCH] code refactoring --- .../ModalContent/PreservedFormatsModal.tsx | 7 +- lib/api/archiveHandler.ts | 318 ++++++++++-------- lib/api/generatePreview.ts | 1 - .../archiveAsReadablility.ts | 8 +- package.json | 1 + public/locales/en/common.json | 3 +- yarn.lock | 51 ++- 7 files changed, 229 insertions(+), 160 deletions(-) diff --git a/components/ModalContent/PreservedFormatsModal.tsx b/components/ModalContent/PreservedFormatsModal.tsx index 0030250..0f2ef36 100644 --- a/components/ModalContent/PreservedFormatsModal.tsx +++ b/components/ModalContent/PreservedFormatsModal.tsx @@ -124,7 +124,7 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) { clearInterval(interval); } }; - }, [link, getLink, link?.singlefile]); + }, [link?.singlefile]); const updateArchive = async () => { const load = toast.loading(t("sending_request")); @@ -210,10 +210,7 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) { className="mx-auto mb-3" size={20} /> - -

- There are more preserved formats in the queue -

+

{t("there_are_more_formats")}

{t("check_back_later")}

) : undefined} diff --git a/lib/api/archiveHandler.ts b/lib/api/archiveHandler.ts index 862c6a2..cd95db4 100644 --- a/lib/api/archiveHandler.ts +++ b/lib/api/archiveHandler.ts @@ -1,4 +1,4 @@ -import { LaunchOptions, chromium, devices } from "playwright"; +import { LaunchOptions, Page, chromium, devices } from "playwright"; import { prisma } from "./db"; import createFile from "./storage/createFile"; import sendToWayback from "./preservationScheme/sendToWayback"; @@ -9,6 +9,7 @@ import generatePreview from "./generatePreview"; import { removeFiles } from "./manageLinkFiles"; import archiveAsSinglefile from "./preservationScheme/archiveAsSinglefile"; import archiveAsReadability from "./preservationScheme/archiveAsReadablility"; +import shell from "shelljs"; type LinksAndCollectionAndOwner = Link & { collection: Collection & { @@ -50,6 +51,26 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) { const page = await context.newPage(); + // await page.goto("https://github.com", { + // waitUntil: "domcontentloaded", + // }); + + // console.log("Opening page:", link.url); + + // await page.evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30); + + // const dom = await page.content(); + + // console.log("The content", dom); + + // shell + // .echo(dom) + // .exec( + // "monolith - -I -b https://marketplace.visualstudio.com/items?itemName=42Crunch.vscode-openapi -j -F -o monolith.html" + // ); + + // console.log("Monolith created!"); + createFolder({ filePath: `archives/preview/${link.collectionId}`, }); @@ -111,13 +132,13 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) { }); // SingleFile - if ( - !link.singlefile?.startsWith("archive") && - !link.singlefile?.startsWith("unavailable") && - user.archiveAsSinglefile && - link.url - ) - await archiveAsSinglefile(link); + // if ( + // !link.singlefile?.startsWith("archive") && + // !link.singlefile?.startsWith("unavailable") && + // user.archiveAsSinglefile && + // link.url + // ) + // await archiveAsSinglefile(link); // send to archive.org if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url); @@ -131,13 +152,6 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) { } else if (link.url) { // archive url - const context = await browser.newContext({ - ...devices["Desktop Chrome"], - ignoreHTTPSErrors: process.env.IGNORE_HTTPS_ERRORS === "true", - }); - - const page = await context.newPage(); - await page.goto(link.url, { waitUntil: "domcontentloaded" }); const content = await page.content(); @@ -150,115 +164,20 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) { await archiveAsReadability(content, link); // Preview + if ( + !link.preview?.startsWith("archives") && + !link.preview?.startsWith("unavailable") + ) + await getArchivePreview(link, page); - const ogImageUrl = await page.evaluate(() => { - const metaTag = document.querySelector('meta[property="og:image"]'); - return metaTag ? (metaTag as any).content : null; - }); - - if (ogImageUrl) { - console.log("Found og:image URL:", ogImageUrl); - - // Download the image - const imageResponse = await page.goto(ogImageUrl); - - // Check if imageResponse is not null - if (imageResponse && !link.preview?.startsWith("archive")) { - const buffer = await imageResponse.body(); - await generatePreview(buffer, link.collectionId, link.id); - } - - await page.goBack(); - } else if (!link.preview?.startsWith("archive")) { - console.log("No og:image found"); - await page - .screenshot({ type: "jpeg", quality: 20 }) - .then((screenshot) => { - return createFile({ - data: screenshot, - filePath: `archives/preview/${link.collectionId}/${link.id}.jpeg`, - }); - }) - .then(() => { - return prisma.link.update({ - where: { id: link.id }, - data: { - preview: `archives/preview/${link.collectionId}/${link.id}.jpeg`, - }, - }); - }); - } - } - - if ( - (!link.image?.startsWith("archives") && - !link.image?.startsWith("unavailable")) || - (!link.pdf?.startsWith("archives") && - !link.pdf?.startsWith("unavailable")) - ) { // Screenshot/PDF - await page.evaluate( - autoScroll, - Number(process.env.AUTOSCROLL_TIMEOUT) || 30 - ); - - // Check if the user hasn't deleted the link by the time we're done scrolling - const linkExists = await prisma.link.findUnique({ - where: { id: link.id }, - }); - if (linkExists) { - const processingPromises = []; - - if ( - user.archiveAsScreenshot && - !link.image?.startsWith("archive") - ) { - processingPromises.push( - page.screenshot({ fullPage: true }).then((screenshot) => { - return createFile({ - data: screenshot, - filePath: `archives/${linkExists.collectionId}/${link.id}.png`, - }); - }) - ); - } - - // apply administrator's defined pdf margins or default to 15px - const margins = { - top: process.env.PDF_MARGIN_TOP || "15px", - bottom: process.env.PDF_MARGIN_BOTTOM || "15px", - }; - - if (user.archiveAsPDF && !link.pdf?.startsWith("archive")) { - processingPromises.push( - page - .pdf({ - width: "1366px", - height: "1931px", - printBackground: true, - margin: margins, - }) - .then((pdf) => { - return createFile({ - data: pdf, - filePath: `archives/${linkExists.collectionId}/${link.id}.pdf`, - }); - }) - ); - } - await Promise.allSettled(processingPromises); - await prisma.link.update({ - where: { id: link.id }, - data: { - image: user.archiveAsScreenshot - ? `archives/${linkExists.collectionId}/${link.id}.png` - : undefined, - pdf: user.archiveAsPDF - ? `archives/${linkExists.collectionId}/${link.id}.pdf` - : undefined, - }, - }); - } + if ( + (!link.image?.startsWith("archives") && + !link.image?.startsWith("unavailable")) || + (!link.pdf?.startsWith("archives") && + !link.pdf?.startsWith("unavailable")) + ) + await captureScreenshotAndPdf(link, page, user); } })(), timeoutPromise, @@ -302,31 +221,6 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) { } } -const autoScroll = async (AUTOSCROLL_TIMEOUT: number) => { - const timeoutPromise = new Promise((_, reject) => { - setTimeout(() => { - reject(new Error(`Webpage was too long to be archived.`)); - }, AUTOSCROLL_TIMEOUT * 1000); - }); - - const scrollingPromise = new Promise((resolve) => { - let totalHeight = 0; - let distance = 100; - let scrollDown = setInterval(() => { - let scrollHeight = document.body.scrollHeight; - window.scrollBy(0, distance); - totalHeight += distance; - if (totalHeight >= scrollHeight) { - clearInterval(scrollDown); - window.scroll(0, 0); - resolve(); - } - }, 100); - }); - - await Promise.race([scrollingPromise, timeoutPromise]); -}; - const imageHandler = async ({ url, id }: Link, extension: string) => { const image = await fetch(url as string).then((res) => res.blob()); @@ -374,3 +268,133 @@ const pdfHandler = async ({ url, id }: Link) => { }); } }; + +const getArchivePreview = async ( + link: LinksAndCollectionAndOwner, + page: Page +) => { + const ogImageUrl = await page.evaluate(() => { + const metaTag = document.querySelector('meta[property="og:image"]'); + return metaTag ? (metaTag as any).content : null; + }); + + if (ogImageUrl) { + console.log("Found og:image URL:", ogImageUrl); + + // Download the image + const imageResponse = await page.goto(ogImageUrl); + + // Check if imageResponse is not null + if (imageResponse && !link.preview?.startsWith("archive")) { + const buffer = await imageResponse.body(); + generatePreview(buffer, link.collectionId, link.id); + } + + await page.goBack(); + } else if (!link.preview?.startsWith("archive")) { + console.log("No og:image found"); + await page + .screenshot({ type: "jpeg", quality: 20 }) + .then((screenshot) => { + return createFile({ + data: screenshot, + filePath: `archives/preview/${link.collectionId}/${link.id}.jpeg`, + }); + }) + .then(() => { + return prisma.link.update({ + where: { id: link.id }, + data: { + preview: `archives/preview/${link.collectionId}/${link.id}.jpeg`, + }, + }); + }); + } +}; + +const captureScreenshotAndPdf = async ( + link: LinksAndCollectionAndOwner, + page: Page, + user: User +) => { + await page.evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30); + + // Check if the user hasn't deleted the link by the time we're done scrolling + const linkExists = await prisma.link.findUnique({ + where: { id: link.id }, + }); + if (linkExists) { + const processingPromises = []; + + if (user.archiveAsScreenshot && !link.image?.startsWith("archive")) { + processingPromises.push( + page.screenshot({ fullPage: true, type: "png" }).then((screenshot) => { + return createFile({ + data: screenshot, + filePath: `archives/${linkExists.collectionId}/${link.id}.png`, + }); + }) + ); + } + + const margins = { + top: process.env.PDF_MARGIN_TOP || "15px", + bottom: process.env.PDF_MARGIN_BOTTOM || "15px", + }; + + if (user.archiveAsPDF && !link.pdf?.startsWith("archive")) { + processingPromises.push( + page + .pdf({ + width: "1366px", + height: "1931px", + printBackground: true, + margin: margins, + }) + .then((pdf) => { + return createFile({ + data: pdf, + filePath: `archives/${linkExists.collectionId}/${link.id}.pdf`, + }); + }) + ); + } + await Promise.allSettled(processingPromises); + await prisma.link.update({ + where: { id: link.id }, + data: { + image: user.archiveAsScreenshot + ? `archives/${linkExists.collectionId}/${link.id}.png` + : undefined, + pdf: user.archiveAsPDF + ? `archives/${linkExists.collectionId}/${link.id}.pdf` + : undefined, + }, + }); + } +}; + +const autoScroll = async (AUTOSCROLL_TIMEOUT: number) => { + const timeoutPromise = new Promise((resolve) => { + setTimeout(() => { + resolve(); + }, AUTOSCROLL_TIMEOUT * 1000); + }); + + const scrollingPromise = new Promise((resolve) => { + let totalHeight = 0; + let distance = 100; + let scrollDown = setInterval(() => { + let scrollHeight = document.body.scrollHeight; + window.scrollBy(0, distance); + totalHeight += distance; + if (totalHeight >= scrollHeight) { + clearInterval(scrollDown); + window.scroll(0, 0); + resolve(); + } + }, 100); + }); + + await Promise.race([scrollingPromise, timeoutPromise]); +}; diff --git a/lib/api/generatePreview.ts b/lib/api/generatePreview.ts index 6e81630..3c2da0f 100644 --- a/lib/api/generatePreview.ts +++ b/lib/api/generatePreview.ts @@ -1,7 +1,6 @@ import Jimp from "jimp"; import { prisma } from "./db"; import createFile from "./storage/createFile"; -import createFolder from "./storage/createFolder"; const generatePreview = async ( buffer: Buffer, diff --git a/lib/api/preservationScheme/archiveAsReadablility.ts b/lib/api/preservationScheme/archiveAsReadablility.ts index dec27da..0f9a5c7 100644 --- a/lib/api/preservationScheme/archiveAsReadablility.ts +++ b/lib/api/preservationScheme/archiveAsReadablility.ts @@ -14,11 +14,9 @@ const archiveAsReadablility = async (content: string, link: Link) => { const articleText = article?.textContent .replace(/ +(?= )/g, "") // strip out multiple spaces .replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks - if ( - articleText && - articleText !== "" && - !link.readable?.startsWith("archive") - ) { + + console.log(articleText); + if (articleText && articleText !== "") { const collectionId = ( await prisma.link.findUnique({ where: { id: link.id }, diff --git a/package.json b/package.json index 0045802..7027122 100644 --- a/package.json +++ b/package.json @@ -70,6 +70,7 @@ "react-masonry-css": "^1.0.16", "react-select": "^5.7.4", "react-spinners": "^0.13.8", + "shelljs": "^0.8.5", "socks-proxy-agent": "^8.0.2", "stripe": "^12.13.0", "tailwind-merge": "^2.3.0", diff --git a/public/locales/en/common.json b/public/locales/en/common.json index 7b6dd57..88f5c33 100644 --- a/public/locales/en/common.json +++ b/public/locales/en/common.json @@ -220,8 +220,9 @@ "github": "GitHub", "twitter": "Twitter", "mastodon": "Mastodon", - "link_preservation_in_queue": "LThe Link preservation is currently in the queue", + "link_preservation_in_queue": "The Link preservation is currently in the queue", "check_back_later": "Please check back later to see the result", + "there_are_more_formats": "There are more preserved formats in the queue", "settings": "Settings", "switch_to": "Switch to {{theme}}", "logout": "Logout", diff --git a/yarn.lock b/yarn.lock index 55074b6..05b76f3 100644 --- a/yarn.lock +++ b/yarn.lock @@ -3514,6 +3514,11 @@ function-bind@^1.1.1: resolved "https://registry.yarnpkg.com/function-bind/-/function-bind-1.1.1.tgz#a56899d3ea3c9bab874bb9773b7c5ede92f4895d" integrity sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A== +function-bind@^1.1.2: + version "1.1.2" + resolved "https://registry.yarnpkg.com/function-bind/-/function-bind-1.1.2.tgz#2c02d864d97f3ea6c8830c464cbd11ab6eab7a1c" + integrity sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA== + function.prototype.name@^1.1.5: version "1.1.5" resolved "https://registry.yarnpkg.com/function.prototype.name/-/function.prototype.name-1.1.5.tgz#cce0505fe1ffb80503e6f9e46cc64e46a12a9621" @@ -3651,7 +3656,7 @@ glob@7.1.7: once "^1.3.0" path-is-absolute "^1.0.0" -glob@^7.1.3: +glob@^7.0.0, glob@^7.1.3: version "7.2.3" resolved "https://registry.yarnpkg.com/glob/-/glob-7.2.3.tgz#b8df0fb802bbfa8e89bd1d938b4e16578ed44f2b" integrity sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q== @@ -3821,6 +3826,13 @@ has@^1.0.3: dependencies: function-bind "^1.1.1" +hasown@^2.0.0: + version "2.0.2" + resolved "https://registry.yarnpkg.com/hasown/-/hasown-2.0.2.tgz#003eaf91be7adc372e84ec59dc37252cedb80003" + integrity sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ== + dependencies: + function-bind "^1.1.2" + hexoid@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/hexoid/-/hexoid-1.0.0.tgz#ad10c6573fb907de23d9ec63a711267d9dc9bc18" @@ -3972,6 +3984,11 @@ internal-slot@^1.0.3, internal-slot@^1.0.4: has "^1.0.3" side-channel "^1.0.4" +interpret@^1.0.0: + version "1.4.0" + resolved "https://registry.yarnpkg.com/interpret/-/interpret-1.4.0.tgz#665ab8bc4da27a774a40584e812e3e0fa45b1a1e" + integrity sha512-agE4QfB2Lkp9uICn7BAqoscw4SZP9kTE2hxiFI3jBPmXJfdqiahTbUuKGsMoN2GtqL9AxhYioAcVvgsb1HvRbA== + invariant@^2.2.4: version "2.2.4" resolved "https://registry.yarnpkg.com/invariant/-/invariant-2.2.4.tgz#610f3c92c9359ce1db616e538008d23ff35158e6" @@ -4053,6 +4070,13 @@ is-core-module@^2.10.0, is-core-module@^2.11.0, is-core-module@^2.9.0: dependencies: has "^1.0.3" +is-core-module@^2.13.0: + version "2.13.1" + resolved "https://registry.yarnpkg.com/is-core-module/-/is-core-module-2.13.1.tgz#ad0d7532c6fea9da1ebdc82742d74525c6273384" + integrity sha512-hHrIjvZsftOsvKSn2TRYl63zvxsgE0K+0mYMoH6gD4omR5IWB2KynivBQczo3+wF1cCkjzvptnI9Q0sPU66ilw== + dependencies: + hasown "^2.0.0" + is-date-object@^1.0.1, is-date-object@^1.0.5: version "1.0.5" resolved "https://registry.yarnpkg.com/is-date-object/-/is-date-object-1.0.5.tgz#0841d5536e724c25597bf6ea62e1bd38298df31f" @@ -5372,6 +5396,13 @@ readdirp@~3.6.0: dependencies: picomatch "^2.2.1" +rechoir@^0.6.2: + version "0.6.2" + resolved "https://registry.yarnpkg.com/rechoir/-/rechoir-0.6.2.tgz#85204b54dba82d5742e28c96756ef43af50e3384" + integrity sha512-HFM8rkZ+i3zrV+4LQjwQ0W+ez98pApMGM3HUrN04j3CqzPOzl9nmP15Y8YXNm8QHGv/eacOVEjqhmWpkRV0NAw== + dependencies: + resolve "^1.1.6" + redux@^4.0.0, redux@^4.0.1: version "4.2.1" resolved "https://registry.yarnpkg.com/redux/-/redux-4.2.1.tgz#c08f4306826c49b5e9dc901dee0452ea8fce6197" @@ -5439,6 +5470,15 @@ resolve-from@^4.0.0: resolved "https://registry.yarnpkg.com/resolve-from/-/resolve-from-4.0.0.tgz#4abcd852ad32dd7baabfe9b40e00a36db5f392e6" integrity sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g== +resolve@^1.1.6: + version "1.22.8" + resolved "https://registry.yarnpkg.com/resolve/-/resolve-1.22.8.tgz#b6c87a9f2aa06dfab52e3d70ac8cde321fa5a48d" + integrity sha512-oKWePCxqpd6FlLvGV1VU0x7bkPmmCNolxzjMf4NczoDnQcIWrAF+cPtZn5i6n+RfD2d9i0tzpKnG6Yk168yIyw== + dependencies: + is-core-module "^2.13.0" + path-parse "^1.0.7" + supports-preserve-symlinks-flag "^1.0.0" + resolve@^1.1.7, resolve@^1.19.0, resolve@^1.22.1, resolve@^1.22.2: version "1.22.2" resolved "https://registry.yarnpkg.com/resolve/-/resolve-1.22.2.tgz#0ed0943d4e301867955766c9f3e1ae6d01c6845f" @@ -5565,6 +5605,15 @@ shell-quote@^1.8.1: resolved "https://registry.yarnpkg.com/shell-quote/-/shell-quote-1.8.1.tgz#6dbf4db75515ad5bac63b4f1894c3a154c766680" integrity sha512-6j1W9l1iAs/4xYBI1SYOVZyFcCis9b4KCLQ8fgAGG07QvzaRLVVRQvAy85yNmmZSjYjg4MWh4gNvlPujU/5LpA== +shelljs@^0.8.5: + version "0.8.5" + resolved "https://registry.yarnpkg.com/shelljs/-/shelljs-0.8.5.tgz#de055408d8361bed66c669d2f000538ced8ee20c" + integrity sha512-TiwcRcrkhHvbrZbnRcFYMLl30Dfov3HKqzp5tO5b4pt6G/SezKcYhmDg15zXVBswHmctSAQKznqNW2LO5tTDow== + dependencies: + glob "^7.0.0" + interpret "^1.0.0" + rechoir "^0.6.2" + side-channel@^1.0.4: version "1.0.4" resolved "https://registry.yarnpkg.com/side-channel/-/side-channel-1.0.4.tgz#efce5c8fdc104ee751b25c58d4290011fa5ea2cf"