improved archive logic
This commit is contained in:
parent
375a55dd37
commit
6ba2aab0ba
|
@ -28,7 +28,7 @@ export default function PreservedFormats() {
|
||||||
}, [links]);
|
}, [links]);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
let interval: NodeJS.Timer | undefined;
|
let interval: any;
|
||||||
if (link?.screenshotPath === "pending" || link?.pdfPath === "pending") {
|
if (link?.screenshotPath === "pending" || link?.pdfPath === "pending") {
|
||||||
let isPublicRoute = router.pathname.startsWith("/public")
|
let isPublicRoute = router.pathname.startsWith("/public")
|
||||||
? true
|
? true
|
||||||
|
|
|
@ -81,7 +81,7 @@ export default function Index() {
|
||||||
}, [link]);
|
}, [link]);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
let interval: NodeJS.Timer | undefined;
|
let interval: any;
|
||||||
if (
|
if (
|
||||||
link?.screenshotPath === "pending" ||
|
link?.screenshotPath === "pending" ||
|
||||||
link?.pdfPath === "pending" ||
|
link?.pdfPath === "pending" ||
|
||||||
|
|
|
@ -5,35 +5,39 @@ import sendToWayback from "../../lib/api/sendToWayback";
|
||||||
import { Readability } from "@mozilla/readability";
|
import { Readability } from "@mozilla/readability";
|
||||||
import { JSDOM } from "jsdom";
|
import { JSDOM } from "jsdom";
|
||||||
import DOMPurify from "dompurify";
|
import DOMPurify from "dompurify";
|
||||||
|
import { Collection, Link, User } from "@prisma/client";
|
||||||
|
|
||||||
export default async function urlHandler(
|
type LinksAndCollectionAndOwner = Link & {
|
||||||
linkId: number,
|
collection: Collection & {
|
||||||
url: string,
|
owner: User;
|
||||||
userId: number
|
};
|
||||||
) {
|
};
|
||||||
const user = await prisma.user.findUnique({ where: { id: userId } });
|
|
||||||
|
export default async function urlHandler(link: LinksAndCollectionAndOwner) {
|
||||||
|
const user = link.collection?.owner;
|
||||||
|
|
||||||
const targetLink = await prisma.link.update({
|
const targetLink = await prisma.link.update({
|
||||||
where: { id: linkId },
|
where: { id: link.id },
|
||||||
data: {
|
data: {
|
||||||
screenshotPath: user?.archiveAsScreenshot ? "pending" : null,
|
screenshotPath: user.archiveAsScreenshot ? "pending" : null,
|
||||||
pdfPath: user?.archiveAsPDF ? "pending" : null,
|
pdfPath: user.archiveAsPDF ? "pending" : null,
|
||||||
readabilityPath: "pending",
|
readabilityPath: "pending",
|
||||||
lastPreserved: new Date().toISOString(),
|
lastPreserved: new Date().toISOString(),
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
// Archive.org
|
// archive.org
|
||||||
|
|
||||||
if (user?.archiveAsWaybackMachine) sendToWayback(url);
|
if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);
|
||||||
|
|
||||||
if (user?.archiveAsPDF || user?.archiveAsScreenshot) {
|
if (user.archiveAsPDF || user.archiveAsScreenshot) {
|
||||||
const browser = await chromium.launch();
|
const browser = await chromium.launch({ headless: false });
|
||||||
const context = await browser.newContext(devices["Desktop Chrome"]);
|
const context = await browser.newContext(devices["Desktop Chrome"]);
|
||||||
const page = await context.newPage();
|
const page = await context.newPage();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await page.goto(url, { waitUntil: "domcontentloaded" });
|
link.url &&
|
||||||
|
(await page.goto(link.url, { waitUntil: "domcontentloaded" }));
|
||||||
|
|
||||||
const content = await page.content();
|
const content = await page.content();
|
||||||
|
|
||||||
|
@ -48,7 +52,7 @@ export default async function urlHandler(
|
||||||
// console.log(doc);
|
// console.log(doc);
|
||||||
// return createFile({
|
// return createFile({
|
||||||
// data: doc,
|
// data: doc,
|
||||||
// filePath: `archives/${targetLink.collectionId}/${linkId}.mhtml`,
|
// filePath: `archives/${targetLink.collectionId}/${link.id}.mhtml`,
|
||||||
// });
|
// });
|
||||||
// };
|
// };
|
||||||
|
|
||||||
|
@ -59,7 +63,7 @@ export default async function urlHandler(
|
||||||
const window = new JSDOM("").window;
|
const window = new JSDOM("").window;
|
||||||
const purify = DOMPurify(window);
|
const purify = DOMPurify(window);
|
||||||
const cleanedUpContent = purify.sanitize(content);
|
const cleanedUpContent = purify.sanitize(content);
|
||||||
const dom = new JSDOM(cleanedUpContent, { url: url });
|
const dom = new JSDOM(cleanedUpContent, { url: link.url || "" });
|
||||||
const article = new Readability(dom.window.document).parse();
|
const article = new Readability(dom.window.document).parse();
|
||||||
|
|
||||||
const articleText = article?.textContent
|
const articleText = article?.textContent
|
||||||
|
@ -68,13 +72,13 @@ export default async function urlHandler(
|
||||||
|
|
||||||
await createFile({
|
await createFile({
|
||||||
data: JSON.stringify(article),
|
data: JSON.stringify(article),
|
||||||
filePath: `archives/${targetLink.collectionId}/${linkId}_readability.json`,
|
filePath: `archives/${targetLink.collectionId}/${link.id}_readability.json`,
|
||||||
});
|
});
|
||||||
|
|
||||||
await prisma.link.update({
|
await prisma.link.update({
|
||||||
where: { id: linkId },
|
where: { id: link.id },
|
||||||
data: {
|
data: {
|
||||||
readabilityPath: `archives/${targetLink.collectionId}/${linkId}_readability.json`,
|
readabilityPath: `archives/${targetLink.collectionId}/${link.id}_readability.json`,
|
||||||
textContent: articleText,
|
textContent: articleText,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
@ -87,16 +91,20 @@ export default async function urlHandler(
|
||||||
.catch((e) => (faulty = true));
|
.catch((e) => (faulty = true));
|
||||||
|
|
||||||
const linkExists = await prisma.link.findUnique({
|
const linkExists = await prisma.link.findUnique({
|
||||||
where: { id: linkId },
|
where: { id: link.id },
|
||||||
});
|
});
|
||||||
|
|
||||||
if (linkExists && !faulty) {
|
if (linkExists && !faulty) {
|
||||||
|
const processingPromises = [];
|
||||||
|
|
||||||
if (user.archiveAsScreenshot) {
|
if (user.archiveAsScreenshot) {
|
||||||
const screenshot = await page.screenshot({ fullPage: true });
|
const screenshot = await page.screenshot({ fullPage: true });
|
||||||
await createFile({
|
processingPromises.push(
|
||||||
data: screenshot,
|
createFile({
|
||||||
filePath: `archives/${linkExists.collectionId}/${linkId}.png`,
|
data: screenshot,
|
||||||
});
|
filePath: `archives/${linkExists.collectionId}/${link.id}.png`,
|
||||||
|
})
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (user.archiveAsPDF) {
|
if (user.archiveAsPDF) {
|
||||||
|
@ -106,27 +114,30 @@ export default async function urlHandler(
|
||||||
printBackground: true,
|
printBackground: true,
|
||||||
margin: { top: "15px", bottom: "15px" },
|
margin: { top: "15px", bottom: "15px" },
|
||||||
});
|
});
|
||||||
|
processingPromises.push(
|
||||||
await createFile({
|
createFile({
|
||||||
data: pdf,
|
data: pdf,
|
||||||
filePath: `archives/${linkExists.collectionId}/${linkId}.pdf`,
|
filePath: `archives/${linkExists.collectionId}/${link.id}.pdf`,
|
||||||
});
|
})
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
await Promise.allSettled(processingPromises);
|
||||||
|
|
||||||
await prisma.link.update({
|
await prisma.link.update({
|
||||||
where: { id: linkId },
|
where: { id: link.id },
|
||||||
data: {
|
data: {
|
||||||
screenshotPath: user.archiveAsScreenshot
|
screenshotPath: user.archiveAsScreenshot
|
||||||
? `archives/${linkExists.collectionId}/${linkId}.png`
|
? `archives/${linkExists.collectionId}/${link.id}.png`
|
||||||
: null,
|
: null,
|
||||||
pdfPath: user.archiveAsPDF
|
pdfPath: user.archiveAsPDF
|
||||||
? `archives/${linkExists.collectionId}/${linkId}.pdf`
|
? `archives/${linkExists.collectionId}/${link.id}.pdf`
|
||||||
: null,
|
: null,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
} else if (faulty) {
|
} else if (faulty) {
|
||||||
await prisma.link.update({
|
await prisma.link.update({
|
||||||
where: { id: linkId },
|
where: { id: link.id },
|
||||||
data: {
|
data: {
|
||||||
screenshotPath: null,
|
screenshotPath: null,
|
||||||
pdfPath: null,
|
pdfPath: null,
|
||||||
|
|
|
@ -1,13 +1,20 @@
|
||||||
|
import { Collection, Link, User } from "@prisma/client";
|
||||||
import { prisma } from "../lib/api/db";
|
import { prisma } from "../lib/api/db";
|
||||||
import urlHandler from "./lib/urlHandler";
|
import urlHandler from "./lib/urlHandler";
|
||||||
|
|
||||||
const args = process.argv.slice(2).join(" ");
|
const args = process.argv.slice(2).join(" ");
|
||||||
|
|
||||||
const archiveTakeCount = Number(process.env.ARCHIVE_TAKE_COUNT || "") || 1;
|
console.log(process.env.NEXTAUTH_URL);
|
||||||
|
|
||||||
// Function to process links for a given user
|
const archiveTakeCount = Number(process.env.ARCHIVE_TAKE_COUNT || "") || 5;
|
||||||
async function processLinksForUser() {
|
|
||||||
// Fetch the first 'maxLinksPerUser' links for the user
|
type LinksAndCollectionAndOwner = Link & {
|
||||||
|
collection: Collection & {
|
||||||
|
owner: User;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
async function processBatch() {
|
||||||
const links = await prisma.link.findMany({
|
const links = await prisma.link.findMany({
|
||||||
where: {
|
where: {
|
||||||
OR: [
|
OR: [
|
||||||
|
@ -19,6 +26,15 @@ async function processLinksForUser() {
|
||||||
},
|
},
|
||||||
screenshotPath: null,
|
screenshotPath: null,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
collection: {
|
||||||
|
owner: {
|
||||||
|
archiveAsScreenshot: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
screenshotPath: "pending",
|
||||||
|
},
|
||||||
|
///////////////////////
|
||||||
{
|
{
|
||||||
collection: {
|
collection: {
|
||||||
owner: {
|
owner: {
|
||||||
|
@ -27,56 +43,79 @@ async function processLinksForUser() {
|
||||||
},
|
},
|
||||||
pdfPath: null,
|
pdfPath: null,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
collection: {
|
||||||
|
owner: {
|
||||||
|
archiveAsPDF: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pdfPath: "pending",
|
||||||
|
},
|
||||||
|
///////////////////////
|
||||||
{
|
{
|
||||||
readabilityPath: null,
|
readabilityPath: null,
|
||||||
},
|
},
|
||||||
],
|
{
|
||||||
collection: {
|
readabilityPath: "pending",
|
||||||
owner: {
|
|
||||||
archiveAsPDF: true,
|
|
||||||
archiveAsScreenshot: true,
|
|
||||||
},
|
},
|
||||||
},
|
],
|
||||||
},
|
},
|
||||||
take: archiveTakeCount,
|
take: archiveTakeCount,
|
||||||
orderBy: { createdAt: "asc" },
|
orderBy: { createdAt: "asc" },
|
||||||
include: {
|
include: {
|
||||||
collection: true,
|
collection: {
|
||||||
|
include: {
|
||||||
|
owner: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
// Process each link using the urlHandler function
|
const archiveLink = async (link: LinksAndCollectionAndOwner) => {
|
||||||
for (const link of links) {
|
|
||||||
try {
|
try {
|
||||||
console.log(
|
console.log(
|
||||||
`Processing link ${link.id} for user ${link.collection.ownerId}`
|
"\x1b[34m%s\x1b[0m",
|
||||||
|
`Processing link ${link.url} for user ${link.collection.ownerId}`
|
||||||
);
|
);
|
||||||
|
|
||||||
await urlHandler(link.id, link.url || "", link.collection.ownerId);
|
await urlHandler(link);
|
||||||
|
|
||||||
|
console.log(
|
||||||
|
"\x1b[34m%s\x1b[0m",
|
||||||
|
`Succeeded processing link ${link.url} for user ${link.collection.ownerId}.`
|
||||||
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(
|
console.error(
|
||||||
`Error processing link ${link.id} for user ${link.collection.ownerId}:`,
|
"\x1b[34m%s\x1b[0m",
|
||||||
|
`Error processing link ${link.url} for user ${link.collection.ownerId}:`,
|
||||||
error
|
error
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
};
|
||||||
|
|
||||||
|
// Process each link in the batch concurrently
|
||||||
|
const processingPromises = links.map((e) => archiveLink(e));
|
||||||
|
|
||||||
|
await Promise.allSettled(processingPromises);
|
||||||
}
|
}
|
||||||
|
|
||||||
const intervalInMinutes = 10; // Set the interval for the worker to run
|
const intervalInMinutes = Number(process.env.ARCHIVE_SCRIPT_INTERVAL) || 10;
|
||||||
|
|
||||||
// Main function to iterate over all users and process their links
|
function delay(sec: number) {
|
||||||
async function processLinksForAllUsers() {
|
return new Promise((resolve) => setTimeout(resolve, sec * 1000));
|
||||||
console.log("Starting the link processing task");
|
}
|
||||||
try {
|
|
||||||
const users = await prisma.user.findMany(); // Fetch all users
|
async function init() {
|
||||||
for (const user of users) {
|
console.log("\x1b[34m%s\x1b[0m", "Starting the link processing task");
|
||||||
await processLinksForUser(); // Process links for each user
|
while (true) {
|
||||||
|
try {
|
||||||
|
await processBatch();
|
||||||
|
await delay(intervalInMinutes);
|
||||||
|
} catch (error) {
|
||||||
|
console.error("\x1b[34m%s\x1b[0m", "Error processing links:", error);
|
||||||
|
await delay(intervalInMinutes);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
|
||||||
console.error("Error processing links for users:", error);
|
|
||||||
}
|
}
|
||||||
setTimeout(processLinksForAllUsers, intervalInMinutes * 60000);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initial run
|
init();
|
||||||
processLinksForAllUsers();
|
|
||||||
|
|
Ŝarĝante…
Reference in New Issue