improved archive logic

This commit is contained in:
daniel31x13 2023-12-11 03:05:47 -05:00
parent 375a55dd37
commit 6ba2aab0ba
4 changed files with 115 additions and 65 deletions

View File

@ -28,7 +28,7 @@ export default function PreservedFormats() {
}, [links]); }, [links]);
useEffect(() => { useEffect(() => {
let interval: NodeJS.Timer | undefined; let interval: any;
if (link?.screenshotPath === "pending" || link?.pdfPath === "pending") { if (link?.screenshotPath === "pending" || link?.pdfPath === "pending") {
let isPublicRoute = router.pathname.startsWith("/public") let isPublicRoute = router.pathname.startsWith("/public")
? true ? true

View File

@ -81,7 +81,7 @@ export default function Index() {
}, [link]); }, [link]);
useEffect(() => { useEffect(() => {
let interval: NodeJS.Timer | undefined; let interval: any;
if ( if (
link?.screenshotPath === "pending" || link?.screenshotPath === "pending" ||
link?.pdfPath === "pending" || link?.pdfPath === "pending" ||

View File

@ -5,35 +5,39 @@ import sendToWayback from "../../lib/api/sendToWayback";
import { Readability } from "@mozilla/readability"; import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom"; import { JSDOM } from "jsdom";
import DOMPurify from "dompurify"; import DOMPurify from "dompurify";
import { Collection, Link, User } from "@prisma/client";
export default async function urlHandler( type LinksAndCollectionAndOwner = Link & {
linkId: number, collection: Collection & {
url: string, owner: User;
userId: number };
) { };
const user = await prisma.user.findUnique({ where: { id: userId } });
export default async function urlHandler(link: LinksAndCollectionAndOwner) {
const user = link.collection?.owner;
const targetLink = await prisma.link.update({ const targetLink = await prisma.link.update({
where: { id: linkId }, where: { id: link.id },
data: { data: {
screenshotPath: user?.archiveAsScreenshot ? "pending" : null, screenshotPath: user.archiveAsScreenshot ? "pending" : null,
pdfPath: user?.archiveAsPDF ? "pending" : null, pdfPath: user.archiveAsPDF ? "pending" : null,
readabilityPath: "pending", readabilityPath: "pending",
lastPreserved: new Date().toISOString(), lastPreserved: new Date().toISOString(),
}, },
}); });
// Archive.org // archive.org
if (user?.archiveAsWaybackMachine) sendToWayback(url); if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);
if (user?.archiveAsPDF || user?.archiveAsScreenshot) { if (user.archiveAsPDF || user.archiveAsScreenshot) {
const browser = await chromium.launch(); const browser = await chromium.launch({ headless: false });
const context = await browser.newContext(devices["Desktop Chrome"]); const context = await browser.newContext(devices["Desktop Chrome"]);
const page = await context.newPage(); const page = await context.newPage();
try { try {
await page.goto(url, { waitUntil: "domcontentloaded" }); link.url &&
(await page.goto(link.url, { waitUntil: "domcontentloaded" }));
const content = await page.content(); const content = await page.content();
@ -48,7 +52,7 @@ export default async function urlHandler(
// console.log(doc); // console.log(doc);
// return createFile({ // return createFile({
// data: doc, // data: doc,
// filePath: `archives/${targetLink.collectionId}/${linkId}.mhtml`, // filePath: `archives/${targetLink.collectionId}/${link.id}.mhtml`,
// }); // });
// }; // };
@ -59,7 +63,7 @@ export default async function urlHandler(
const window = new JSDOM("").window; const window = new JSDOM("").window;
const purify = DOMPurify(window); const purify = DOMPurify(window);
const cleanedUpContent = purify.sanitize(content); const cleanedUpContent = purify.sanitize(content);
const dom = new JSDOM(cleanedUpContent, { url: url }); const dom = new JSDOM(cleanedUpContent, { url: link.url || "" });
const article = new Readability(dom.window.document).parse(); const article = new Readability(dom.window.document).parse();
const articleText = article?.textContent const articleText = article?.textContent
@ -68,13 +72,13 @@ export default async function urlHandler(
await createFile({ await createFile({
data: JSON.stringify(article), data: JSON.stringify(article),
filePath: `archives/${targetLink.collectionId}/${linkId}_readability.json`, filePath: `archives/${targetLink.collectionId}/${link.id}_readability.json`,
}); });
await prisma.link.update({ await prisma.link.update({
where: { id: linkId }, where: { id: link.id },
data: { data: {
readabilityPath: `archives/${targetLink.collectionId}/${linkId}_readability.json`, readabilityPath: `archives/${targetLink.collectionId}/${link.id}_readability.json`,
textContent: articleText, textContent: articleText,
}, },
}); });
@ -87,16 +91,20 @@ export default async function urlHandler(
.catch((e) => (faulty = true)); .catch((e) => (faulty = true));
const linkExists = await prisma.link.findUnique({ const linkExists = await prisma.link.findUnique({
where: { id: linkId }, where: { id: link.id },
}); });
if (linkExists && !faulty) { if (linkExists && !faulty) {
const processingPromises = [];
if (user.archiveAsScreenshot) { if (user.archiveAsScreenshot) {
const screenshot = await page.screenshot({ fullPage: true }); const screenshot = await page.screenshot({ fullPage: true });
await createFile({ processingPromises.push(
data: screenshot, createFile({
filePath: `archives/${linkExists.collectionId}/${linkId}.png`, data: screenshot,
}); filePath: `archives/${linkExists.collectionId}/${link.id}.png`,
})
);
} }
if (user.archiveAsPDF) { if (user.archiveAsPDF) {
@ -106,27 +114,30 @@ export default async function urlHandler(
printBackground: true, printBackground: true,
margin: { top: "15px", bottom: "15px" }, margin: { top: "15px", bottom: "15px" },
}); });
processingPromises.push(
await createFile({ createFile({
data: pdf, data: pdf,
filePath: `archives/${linkExists.collectionId}/${linkId}.pdf`, filePath: `archives/${linkExists.collectionId}/${link.id}.pdf`,
}); })
);
} }
await Promise.allSettled(processingPromises);
await prisma.link.update({ await prisma.link.update({
where: { id: linkId }, where: { id: link.id },
data: { data: {
screenshotPath: user.archiveAsScreenshot screenshotPath: user.archiveAsScreenshot
? `archives/${linkExists.collectionId}/${linkId}.png` ? `archives/${linkExists.collectionId}/${link.id}.png`
: null, : null,
pdfPath: user.archiveAsPDF pdfPath: user.archiveAsPDF
? `archives/${linkExists.collectionId}/${linkId}.pdf` ? `archives/${linkExists.collectionId}/${link.id}.pdf`
: null, : null,
}, },
}); });
} else if (faulty) { } else if (faulty) {
await prisma.link.update({ await prisma.link.update({
where: { id: linkId }, where: { id: link.id },
data: { data: {
screenshotPath: null, screenshotPath: null,
pdfPath: null, pdfPath: null,

View File

@ -1,13 +1,20 @@
import { Collection, Link, User } from "@prisma/client";
import { prisma } from "../lib/api/db"; import { prisma } from "../lib/api/db";
import urlHandler from "./lib/urlHandler"; import urlHandler from "./lib/urlHandler";
const args = process.argv.slice(2).join(" "); const args = process.argv.slice(2).join(" ");
const archiveTakeCount = Number(process.env.ARCHIVE_TAKE_COUNT || "") || 1; console.log(process.env.NEXTAUTH_URL);
// Function to process links for a given user const archiveTakeCount = Number(process.env.ARCHIVE_TAKE_COUNT || "") || 5;
async function processLinksForUser() {
// Fetch the first 'maxLinksPerUser' links for the user type LinksAndCollectionAndOwner = Link & {
collection: Collection & {
owner: User;
};
};
async function processBatch() {
const links = await prisma.link.findMany({ const links = await prisma.link.findMany({
where: { where: {
OR: [ OR: [
@ -19,6 +26,15 @@ async function processLinksForUser() {
}, },
screenshotPath: null, screenshotPath: null,
}, },
{
collection: {
owner: {
archiveAsScreenshot: true,
},
},
screenshotPath: "pending",
},
///////////////////////
{ {
collection: { collection: {
owner: { owner: {
@ -27,56 +43,79 @@ async function processLinksForUser() {
}, },
pdfPath: null, pdfPath: null,
}, },
{
collection: {
owner: {
archiveAsPDF: true,
},
},
pdfPath: "pending",
},
///////////////////////
{ {
readabilityPath: null, readabilityPath: null,
}, },
], {
collection: { readabilityPath: "pending",
owner: {
archiveAsPDF: true,
archiveAsScreenshot: true,
}, },
}, ],
}, },
take: archiveTakeCount, take: archiveTakeCount,
orderBy: { createdAt: "asc" }, orderBy: { createdAt: "asc" },
include: { include: {
collection: true, collection: {
include: {
owner: true,
},
},
}, },
}); });
// Process each link using the urlHandler function const archiveLink = async (link: LinksAndCollectionAndOwner) => {
for (const link of links) {
try { try {
console.log( console.log(
`Processing link ${link.id} for user ${link.collection.ownerId}` "\x1b[34m%s\x1b[0m",
`Processing link ${link.url} for user ${link.collection.ownerId}`
); );
await urlHandler(link.id, link.url || "", link.collection.ownerId); await urlHandler(link);
console.log(
"\x1b[34m%s\x1b[0m",
`Succeeded processing link ${link.url} for user ${link.collection.ownerId}.`
);
} catch (error) { } catch (error) {
console.error( console.error(
`Error processing link ${link.id} for user ${link.collection.ownerId}:`, "\x1b[34m%s\x1b[0m",
`Error processing link ${link.url} for user ${link.collection.ownerId}:`,
error error
); );
} }
} };
// Process each link in the batch concurrently
const processingPromises = links.map((e) => archiveLink(e));
await Promise.allSettled(processingPromises);
} }
const intervalInMinutes = 10; // Set the interval for the worker to run const intervalInMinutes = Number(process.env.ARCHIVE_SCRIPT_INTERVAL) || 10;
// Main function to iterate over all users and process their links function delay(sec: number) {
async function processLinksForAllUsers() { return new Promise((resolve) => setTimeout(resolve, sec * 1000));
console.log("Starting the link processing task"); }
try {
const users = await prisma.user.findMany(); // Fetch all users async function init() {
for (const user of users) { console.log("\x1b[34m%s\x1b[0m", "Starting the link processing task");
await processLinksForUser(); // Process links for each user while (true) {
try {
await processBatch();
await delay(intervalInMinutes);
} catch (error) {
console.error("\x1b[34m%s\x1b[0m", "Error processing links:", error);
await delay(intervalInMinutes);
} }
} catch (error) {
console.error("Error processing links for users:", error);
} }
setTimeout(processLinksForAllUsers, intervalInMinutes * 60000);
} }
// Initial run init();
processLinksForAllUsers();