From 2b8f7d4be20026705fc99db18d9f38c7c0f8a7d3 Mon Sep 17 00:00:00 2001 From: daniel31x13 Date: Tue, 26 Mar 2024 01:38:08 -0400 Subject: [PATCH] code improvements --- .../ModalContent/PreservedFormatsModal.tsx | 131 +++--- lib/api/archiveHandler.ts | 395 ++++++++---------- .../archiveAsReadablility.ts | 44 ++ .../preservationScheme/archiveAsSinglefile.ts | 111 +++++ .../{ => preservationScheme}/sendToWayback.ts | 0 5 files changed, 401 insertions(+), 280 deletions(-) create mode 100644 lib/api/preservationScheme/archiveAsReadablility.ts create mode 100644 lib/api/preservationScheme/archiveAsSinglefile.ts rename lib/api/{ => preservationScheme}/sendToWayback.ts (100%) diff --git a/components/ModalContent/PreservedFormatsModal.tsx b/components/ModalContent/PreservedFormatsModal.tsx index c52c492..e6906a7 100644 --- a/components/ModalContent/PreservedFormatsModal.tsx +++ b/components/ModalContent/PreservedFormatsModal.tsx @@ -18,6 +18,7 @@ import { import PreservedFormatRow from "@/components/PreserverdFormatRow"; import useAccountStore from "@/store/account"; import getPublicUserData from "@/lib/client/getPublicUserData"; +import { BeatLoader } from "react-spinners"; type Props = { onClose: Function; @@ -87,6 +88,15 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) { ); }; + const atLeastOneFormatAvailable = () => { + return ( + screenshotAvailable(link) || + pdfAvailable(link) || + readabilityAvailable(link) || + singlefileAvailable(link) + ); + }; + useEffect(() => { (async () => { const data = await getLink(link.id as number, isPublic); @@ -143,11 +153,10 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
- {isReady() && - (screenshotAvailable(link) || - pdfAvailable(link) || - readabilityAvailable(link) || - singlefileAvailable(link)) ? ( + {screenshotAvailable(link) || + pdfAvailable(link) || + readabilityAvailable(link) || + singlefileAvailable(link) ? (

The following formats are available for this link:

@@ -156,56 +165,57 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) { )}
- {isReady() ? ( - <> - {screenshotAvailable(link) ? ( - - ) : undefined} + {screenshotAvailable(link) ? ( + + ) : undefined} - {pdfAvailable(link) ? ( - - ) : undefined} + {pdfAvailable(link) ? ( + + ) : undefined} - {readabilityAvailable(link) ? ( - - ) : undefined} + {singlefileAvailable(link) ? ( + + ) : undefined} + + {readabilityAvailable(link) ? ( + + ) : undefined} + + {!isReady() && !atLeastOneFormatAvailable() ? ( +
+ - {singlefileAvailable(link) ? ( - - ) : undefined} - - ) : ( -
-

Link preservation is in the queue

@@ -213,7 +223,22 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) { Please check back later to see the result

- )} + ) : !isReady() && atLeastOneFormatAvailable() ? ( +
+ + +

+ There are more preserved formats in the queue +

+

+ Please check back later to see the result +

+
+ ) : undefined}
{ setTimeout( () => @@ -54,9 +31,24 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) { ); }); + // allow user to configure a proxy + let browserOptions: LaunchOptions = {}; + if (process.env.PROXY) { + browserOptions.proxy = { + server: process.env.PROXY, + bypass: process.env.PROXY_BYPASS, + username: process.env.PROXY_USERNAME, + password: process.env.PROXY_PASSWORD, + }; + } + + const browser = await chromium.launch(browserOptions); + try { await Promise.race([ (async () => { + const user = link.collection?.owner; + const validatedUrl = link.url ? await validateUrlSize(link.url) : undefined; @@ -76,12 +68,7 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) { else if (contentType.includes("image/png")) imageExtension = "png"; } - const user = link.collection?.owner; - - // send to archive.org - if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url); - - const targetLink = await prisma.link.update({ + await prisma.link.update({ where: { id: link.id }, data: { type: linkType, @@ -106,6 +93,18 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) { }, }); + // SingleFile + if ( + !link.singlefile?.startsWith("archive") && + !link.singlefile?.startsWith("unavailable") && + user.archiveAsSinglefile && + link.url + ) + await archiveAsSinglefile(link); + + // send to archive.org + if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url); + if (linkType === "image" && !link.image?.startsWith("archive")) { await imageHandler(link, imageExtension); // archive image (jpeg/png) return; @@ -115,230 +114,172 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) { } else if (link.url) { // archive url + const context = await browser.newContext({ + ...devices["Desktop Chrome"], + ignoreHTTPSErrors: process.env.IGNORE_HTTPS_ERRORS === "true", + }); + + const page = await context.newPage(); + await page.goto(link.url, { waitUntil: "domcontentloaded" }); const content = await page.content(); - // Singlefile - if ( - user.archiveAsSinglefile && - !link.singlefile?.startsWith("archive") - ) { - let command = process.env.SINGLEFILE_ARCHIVE_COMMAND; - let httpApi = process.env.SINGLEFILE_ARCHIVE_HTTP_API; - if (command) { - if (command.includes("{{URL}}")) { - try { - let html = execSync(command.replace("{{URL}}", link.url), { - timeout: 60000, - maxBuffer: 1024 * 1024 * 100, - }); - await createFile({ - data: html, - filePath: `archives/${targetLink.collectionId}/${link.id}.html`, - }); - } catch (err) { - console.error( - "Error running SINGLEFILE_ARCHIVE_COMMAND:", - err - ); - } - } else { - console.error( - "Invalid SINGLEFILE_ARCHIVE_COMMAND. Missing {{URL}}" - ); - } - } else if (httpApi) { - try { - let html = await axios.post( - httpApi, - { url: link.url }, - { - headers: { - "Content-Type": "application/x-www-form-urlencoded", - }, - httpAgent: new Agent({ keepAlive: false }), - } - ); - await createFile({ - data: html.data, - filePath: `archives/${targetLink.collectionId}/${link.id}.html`, - }); - } catch (err) { - console.error( - "Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API:", - err - ); - } - } else { - console.error( - "No SINGLEFILE_ARCHIVE_COMMAND or SINGLEFILE_ARCHIVE_HTTP_API defined." - ); - } - } - // Readability - const window = new JSDOM("").window; - const purify = DOMPurify(window); - const cleanedUpContent = purify.sanitize(content); - const dom = new JSDOM(cleanedUpContent, { url: link.url || "" }); - const article = new Readability(dom.window.document).parse(); - const articleText = article?.textContent - .replace(/ +(?= )/g, "") // strip out multiple spaces - .replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks if ( - articleText && - articleText !== "" && - !link.readable?.startsWith("archive") - ) { - await createFile({ - data: JSON.stringify(article), - filePath: `archives/${targetLink.collectionId}/${link.id}_readability.json`, - }); - - await prisma.link.update({ - where: { id: link.id }, - data: { - readable: `archives/${targetLink.collectionId}/${link.id}_readability.json`, - textContent: articleText, - }, - }); - } + !link.readable?.startsWith("archives") && + !link.readable?.startsWith("unavailable") + ) + await archiveAsReadability(content, link); // Preview - const ogImageUrl = await page.evaluate(() => { - const metaTag = document.querySelector('meta[property="og:image"]'); - return metaTag ? (metaTag as any).content : null; - }); + if ( + !link.preview?.startsWith("archives") && + !link.preview?.startsWith("unavailable") + ) { + const ogImageUrl = await page.evaluate(() => { + const metaTag = document.querySelector( + 'meta[property="og:image"]' + ); + return metaTag ? (metaTag as any).content : null; + }); - createFolder({ - filePath: `archives/preview/${link.collectionId}`, - }); + createFolder({ + filePath: `archives/preview/${link.collectionId}`, + }); - if (ogImageUrl) { - console.log("Found og:image URL:", ogImageUrl); + if (ogImageUrl) { + console.log("Found og:image URL:", ogImageUrl); - // Download the image - const imageResponse = await page.goto(ogImageUrl); + // Download the image + const imageResponse = await page.goto(ogImageUrl); - // Check if imageResponse is not null - if (imageResponse && !link.preview?.startsWith("archive")) { - const buffer = await imageResponse.body(); + // Check if imageResponse is not null + if (imageResponse && !link.preview?.startsWith("archive")) { + const buffer = await imageResponse.body(); - // Check if buffer is not null - if (buffer) { - // Load the image using Jimp - Jimp.read(buffer, async (err, image) => { - if (image && !err) { - image?.resize(1280, Jimp.AUTO).quality(20); - const processedBuffer = await image?.getBufferAsync( - Jimp.MIME_JPEG - ); + // Check if buffer is not null + if (buffer) { + // Load the image using Jimp + Jimp.read(buffer, async (err, image) => { + if (image && !err) { + image?.resize(1280, Jimp.AUTO).quality(20); + const processedBuffer = await image?.getBufferAsync( + Jimp.MIME_JPEG + ); - createFile({ - data: processedBuffer, - filePath: `archives/preview/${link.collectionId}/${link.id}.jpeg`, - }).then(() => { - return prisma.link.update({ - where: { id: link.id }, - data: { - preview: `archives/preview/${link.collectionId}/${link.id}.jpeg`, - }, + createFile({ + data: processedBuffer, + filePath: `archives/preview/${link.collectionId}/${link.id}.jpeg`, + }).then(() => { + return prisma.link.update({ + where: { id: link.id }, + data: { + preview: `archives/preview/${link.collectionId}/${link.id}.jpeg`, + }, + }); }); - }); - } - }).catch((err) => { - console.error("Error processing the image:", err); - }); - } else { - console.log("No image data found."); + } + }).catch((err) => { + console.error("Error processing the image:", err); + }); + } else { + console.log("No image data found."); + } } - } - await page.goBack(); - } else if (!link.preview?.startsWith("archive")) { - console.log("No og:image found"); - await page - .screenshot({ type: "jpeg", quality: 20 }) - .then((screenshot) => { - return createFile({ - data: screenshot, - filePath: `archives/preview/${link.collectionId}/${link.id}.jpeg`, - }); - }) - .then(() => { - return prisma.link.update({ - where: { id: link.id }, - data: { - preview: `archives/preview/${link.collectionId}/${link.id}.jpeg`, - }, - }); - }); - } - - // Screenshot/PDF - await page.evaluate( - autoScroll, - Number(process.env.AUTOSCROLL_TIMEOUT) || 30 - ); - - // Check if the user hasn't deleted the link by the time we're done scrolling - const linkExists = await prisma.link.findUnique({ - where: { id: link.id }, - }); - if (linkExists) { - const processingPromises = []; - - if ( - user.archiveAsScreenshot && - !link.image?.startsWith("archive") - ) { - processingPromises.push( - page.screenshot({ fullPage: true }).then((screenshot) => { + await page.goBack(); + } else if (!link.preview?.startsWith("archive")) { + console.log("No og:image found"); + await page + .screenshot({ type: "jpeg", quality: 20 }) + .then((screenshot) => { return createFile({ data: screenshot, - filePath: `archives/${linkExists.collectionId}/${link.id}.png`, + filePath: `archives/preview/${link.collectionId}/${link.id}.jpeg`, }); }) - ); + .then(() => { + return prisma.link.update({ + where: { id: link.id }, + data: { + preview: `archives/preview/${link.collectionId}/${link.id}.jpeg`, + }, + }); + }); } + } - // apply administrator's defined pdf margins or default to 15px - const margins = { - top: process.env.PDF_MARGIN_TOP || "15px", - bottom: process.env.PDF_MARGIN_BOTTOM || "15px", - }; + if ( + (!link.image?.startsWith("archives") && + !link.image?.startsWith("unavailable")) || + (!link.pdf?.startsWith("archives") && + !link.pdf?.startsWith("unavailable")) + ) { + // Screenshot/PDF + await page.evaluate( + autoScroll, + Number(process.env.AUTOSCROLL_TIMEOUT) || 30 + ); - if (user.archiveAsPDF && !link.pdf?.startsWith("archive")) { - processingPromises.push( - page - .pdf({ - width: "1366px", - height: "1931px", - printBackground: true, - margin: margins, - }) - .then((pdf) => { + // Check if the user hasn't deleted the link by the time we're done scrolling + const linkExists = await prisma.link.findUnique({ + where: { id: link.id }, + }); + if (linkExists) { + const processingPromises = []; + + if ( + user.archiveAsScreenshot && + !link.image?.startsWith("archive") + ) { + processingPromises.push( + page.screenshot({ fullPage: true }).then((screenshot) => { return createFile({ - data: pdf, - filePath: `archives/${linkExists.collectionId}/${link.id}.pdf`, + data: screenshot, + filePath: `archives/${linkExists.collectionId}/${link.id}.png`, }); }) - ); + ); + } + + // apply administrator's defined pdf margins or default to 15px + const margins = { + top: process.env.PDF_MARGIN_TOP || "15px", + bottom: process.env.PDF_MARGIN_BOTTOM || "15px", + }; + + if (user.archiveAsPDF && !link.pdf?.startsWith("archive")) { + processingPromises.push( + page + .pdf({ + width: "1366px", + height: "1931px", + printBackground: true, + margin: margins, + }) + .then((pdf) => { + return createFile({ + data: pdf, + filePath: `archives/${linkExists.collectionId}/${link.id}.pdf`, + }); + }) + ); + } + await Promise.allSettled(processingPromises); + await prisma.link.update({ + where: { id: link.id }, + data: { + image: user.archiveAsScreenshot + ? `archives/${linkExists.collectionId}/${link.id}.png` + : undefined, + pdf: user.archiveAsPDF + ? `archives/${linkExists.collectionId}/${link.id}.pdf` + : undefined, + }, + }); } - await Promise.allSettled(processingPromises); - await prisma.link.update({ - where: { id: link.id }, - data: { - image: user.archiveAsScreenshot - ? `archives/${linkExists.collectionId}/${link.id}.png` - : undefined, - pdf: user.archiveAsPDF - ? `archives/${linkExists.collectionId}/${link.id}.pdf` - : undefined, - }, - }); } } })(), diff --git a/lib/api/preservationScheme/archiveAsReadablility.ts b/lib/api/preservationScheme/archiveAsReadablility.ts new file mode 100644 index 0000000..dec27da --- /dev/null +++ b/lib/api/preservationScheme/archiveAsReadablility.ts @@ -0,0 +1,44 @@ +import { Readability } from "@mozilla/readability"; +import { JSDOM } from "jsdom"; +import DOMPurify from "dompurify"; +import { prisma } from "../db"; +import createFile from "../storage/createFile"; +import { Link } from "@prisma/client"; + +const archiveAsReadablility = async (content: string, link: Link) => { + const window = new JSDOM("").window; + const purify = DOMPurify(window); + const cleanedUpContent = purify.sanitize(content); + const dom = new JSDOM(cleanedUpContent, { url: link.url || "" }); + const article = new Readability(dom.window.document).parse(); + const articleText = article?.textContent + .replace(/ +(?= )/g, "") // strip out multiple spaces + .replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks + if ( + articleText && + articleText !== "" && + !link.readable?.startsWith("archive") + ) { + const collectionId = ( + await prisma.link.findUnique({ + where: { id: link.id }, + select: { collectionId: true }, + }) + )?.collectionId; + + await createFile({ + data: JSON.stringify(article), + filePath: `archives/${collectionId}/${link.id}_readability.json`, + }); + + await prisma.link.update({ + where: { id: link.id }, + data: { + readable: `archives/${collectionId}/${link.id}_readability.json`, + textContent: articleText, + }, + }); + } +}; + +export default archiveAsReadablility; diff --git a/lib/api/preservationScheme/archiveAsSinglefile.ts b/lib/api/preservationScheme/archiveAsSinglefile.ts new file mode 100644 index 0000000..0c739b8 --- /dev/null +++ b/lib/api/preservationScheme/archiveAsSinglefile.ts @@ -0,0 +1,111 @@ +import { execSync } from "child_process"; +import createFile from "../storage/createFile"; +import axios from "axios"; +import { Agent } from "http"; +import { prisma } from "../db"; +import { Link } from "@prisma/client"; + +const archiveAsSinglefile = async (link: Link) => { + if (!link.url) return; + + let command = process.env.SINGLEFILE_ARCHIVE_COMMAND; + let httpApi = process.env.SINGLEFILE_ARCHIVE_HTTP_API; + if (command) { + if (command.includes("{{URL}}")) { + try { + let html = execSync(command.replace("{{URL}}", link.url), { + timeout: 120000, + maxBuffer: 1024 * 1024 * 30, + }); + + if (!html.length) { + console.error( + "Error running SINGLEFILE_ARCHIVE_COMMAND: Empty buffer" + ); + return; + } + + const collectionId = ( + await prisma.link.findUnique({ + where: { id: link.id }, + select: { collectionId: true }, + }) + )?.collectionId; + + if (!collectionId) { + console.error( + "Error running SINGLEFILE_ARCHIVE_COMMAND: Collection ID not found" + ); + return; + } + + await createFile({ + data: html, + filePath: `archives/${collectionId}/${link.id}.html`, + }).then(async () => { + await prisma.link.update({ + where: { id: link.id }, + data: { + singlefile: `archives/${collectionId}/${link.id}.html`, + }, + }); + }); + } catch (err) { + console.error("Error running SINGLEFILE_ARCHIVE_COMMAND:", err); + } + } else { + console.error("Invalid SINGLEFILE_ARCHIVE_COMMAND. Missing {{URL}}"); + } + } else if (httpApi) { + try { + let html = await axios.post( + httpApi, + { url: link.url }, + { + headers: { + "Content-Type": "application/x-www-form-urlencoded", + }, + httpAgent: new Agent({ keepAlive: false }), + } + ); + + if (!html.data.length) { + console.error("Error running SINGLEFILE_ARCHIVE_COMMAND: Empty buffer"); + return; + } + + const collectionId = ( + await prisma.link.findUnique({ + where: { id: link.id }, + select: { collectionId: true }, + }) + )?.collectionId; + + if (!collectionId) { + console.error( + "Error running SINGLEFILE_ARCHIVE_COMMAND: Collection ID not found" + ); + return; + } + + await createFile({ + data: html.data, + filePath: `archives/${collectionId}/${link.id}.html`, + }).then(async () => { + await prisma.link.update({ + where: { id: link.id }, + data: { + singlefile: `archives/${collectionId}/${link.id}.html`, + }, + }); + }); + } catch (err) { + console.error( + "Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API:", + err + ); + } + } +}; + +export default archiveAsSinglefile; diff --git a/lib/api/sendToWayback.ts b/lib/api/preservationScheme/sendToWayback.ts similarity index 100% rename from lib/api/sendToWayback.ts rename to lib/api/preservationScheme/sendToWayback.ts