2023-10-18 23:00:23 -05:00
|
|
|
import { chromium, devices } from "playwright";
|
2023-03-25 09:17:34 -05:00
|
|
|
import { prisma } from "@/lib/api/db";
|
2023-07-01 09:11:39 -05:00
|
|
|
import createFile from "@/lib/api/storage/createFile";
|
2023-10-13 01:03:38 -05:00
|
|
|
import sendToWayback from "./sendToWayback";
|
2023-10-29 23:30:45 -05:00
|
|
|
import { Readability } from "@mozilla/readability";
|
|
|
|
import { JSDOM } from "jsdom";
|
|
|
|
import DOMPurify from "dompurify";
|
2023-03-08 15:31:24 -06:00
|
|
|
|
2023-10-18 23:00:23 -05:00
|
|
|
/**
 * Preserves a saved link according to the owning user's archive preferences.
 *
 * Steps, in order:
 * 1. Flags the link's archive paths as `"pending"` (or `null` for formats the
 *    user has not enabled) and stamps `lastPreserved`.
 * 2. If enabled, submits the URL to the Wayback Machine without waiting for it.
 * 3. If a screenshot or PDF is requested, loads the page in Chromium, stores a
 *    sanitized Readability extraction plus its flattened text, scrolls the
 *    page, and finally stores the screenshot and/or PDF.
 *
 * @param linkId - Primary key of the link row being archived.
 * @param url - Address of the page to load and preserve.
 * @param userId - Primary key of the user whose archive settings apply.
 * @throws Re-throws (after logging) any error raised while the browser is in
 *   use; the browser is closed in every case.
 */
export default async function archive(
  linkId: number,
  url: string,
  userId: number
) {
  const user = await prisma.user.findUnique({ where: { id: userId } });

  // A missing user simply disables every optional format.
  const wantsScreenshot = Boolean(user?.archiveAsScreenshot);
  const wantsPDF = Boolean(user?.archiveAsPDF);

  // NOTE(review): readabilityPath is flagged "pending" unconditionally, but it
  // is only resolved inside the screenshot/PDF branch below. Users with both
  // formats disabled keep a permanent "pending" marker, and so does any run
  // that throws. Confirm whether that is intended before changing it.
  const targetLink = await prisma.link.update({
    where: { id: linkId },
    data: {
      screenshotPath: wantsScreenshot ? "pending" : null,
      pdfPath: wantsPDF ? "pending" : null,
      readabilityPath: "pending",
      lastPreserved: new Date().toISOString(),
    },
  });

  // Archive.org
  // Fire-and-forget, but with a rejection handler so a failed submission is
  // logged instead of surfacing as an unhandled promise rejection.
  if (user?.archiveAsWaybackMachine) {
    void Promise.resolve()
      .then(() => sendToWayback(url))
      .catch((err: unknown) => console.log(err));
  }

  if (!wantsScreenshot && !wantsPDF) return;

  const browser = await chromium.launch();

  try {
    // Context and page creation live inside the try so that a failure here
    // still reaches the finally block and closes the browser.
    const context = await browser.newContext(devices["Desktop Chrome"]);
    const page = await context.newPage();

    await page.goto(url, { waitUntil: "domcontentloaded" });

    const content = await page.content();

    // Readability
    // Sanitize the raw markup before handing it to a second JSDOM instance
    // that Readability parses.
    const window = new JSDOM("").window;
    const purify = DOMPurify(window);
    const cleanedUpContent = purify.sanitize(content);
    const dom = new JSDOM(cleanedUpContent, { url: url });
    const article = new Readability(dom.window.document).parse();

    // `article` is null when Readability finds nothing to extract; the text
    // is then left undefined.
    const articleText = article?.textContent
      .replace(/ +(?= )/g, "") // strip out multiple spaces
      .replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks

    const readabilityPath = `archives/${targetLink.collectionId}/${linkId}_readability.json`;

    await createFile({
      data: JSON.stringify(article),
      filePath: readabilityPath,
    });

    await prisma.link.update({
      where: { id: linkId },
      data: {
        readabilityPath,
        textContent: articleText,
      },
    });

    // Screenshot/PDF
    // Scroll through the page first; a rejection (e.g. the scroll timeout)
    // marks the capture as faulty rather than aborting the whole archive.
    // `||` is deliberate: an unset, non-numeric or zero value falls back to 30.
    let faulty = false;
    try {
      await page.evaluate(
        autoScroll,
        Number(process.env.AUTOSCROLL_TIMEOUT) || 30
      );
    } catch {
      faulty = true;
    }

    // Re-read the link: it may have been deleted or moved to another
    // collection while the page was loading and scrolling.
    const linkExists = await prisma.link.findUnique({
      where: { id: linkId },
    });

    if (linkExists && !faulty) {
      const screenshotPath = `archives/${linkExists.collectionId}/${linkId}.png`;
      const pdfPath = `archives/${linkExists.collectionId}/${linkId}.pdf`;

      if (wantsScreenshot) {
        const screenshot = await page.screenshot({ fullPage: true });

        await createFile({
          data: screenshot,
          filePath: screenshotPath,
        });
      }

      if (wantsPDF) {
        const pdf = await page.pdf({
          width: "1366px",
          height: "1931px",
          printBackground: true,
          margin: { top: "15px", bottom: "15px" },
        });

        await createFile({
          data: pdf,
          filePath: pdfPath,
        });
      }

      await prisma.link.update({
        where: { id: linkId },
        data: {
          screenshotPath: wantsScreenshot ? screenshotPath : null,
          pdfPath: wantsPDF ? pdfPath : null,
        },
      });
    } else if (linkExists && faulty) {
      // Capture was abandoned: clear the "pending" markers. Skipped when the
      // row is gone, since updating a missing record would only throw.
      await prisma.link.update({
        where: { id: linkId },
        data: {
          screenshotPath: null,
          pdfPath: null,
        },
      });
    }
  } catch (err) {
    console.log(err);
    throw err;
  } finally {
    await browser.close();
  }
}
|
2023-03-08 15:31:24 -06:00
|
|
|
|
2023-08-10 11:16:44 -05:00
|
|
|
/**
 * Scrolls the current document from top to bottom in 100px steps (one step
 * every 100ms) and then jumps back to the top.
 *
 * This function is handed to `page.evaluate`, so it relies on the page's
 * `document`/`window` globals and must stay self-contained: it cannot
 * reference helpers or imports from this module.
 *
 * @param AUTOSCROLL_TIMEOUT - Maximum number of seconds to keep scrolling.
 * @returns A promise that resolves once the accumulated scroll distance
 *   reaches the document body's scroll height.
 * @throws Error("Webpage was too long to be archived.") when the timeout
 *   elapses before the bottom is reached.
 */
const autoScroll = async (AUTOSCROLL_TIMEOUT: number) => {
  let scrollDown: ReturnType<typeof setInterval> | undefined;
  let timeoutId: ReturnType<typeof setTimeout> | undefined;

  const timeoutPromise = new Promise<void>((_, reject) => {
    timeoutId = setTimeout(() => {
      reject(new Error(`Webpage was too long to be archived.`));
    }, AUTOSCROLL_TIMEOUT * 1000);
  });

  const scrollingPromise = new Promise<void>((resolve) => {
    let totalHeight = 0;
    const distance = 100;

    scrollDown = setInterval(() => {
      // Re-read the height on every tick: it can change while scrolling.
      const scrollHeight = document.body.scrollHeight;
      window.scrollBy(0, distance);
      totalHeight += distance;

      if (totalHeight >= scrollHeight) {
        clearInterval(scrollDown);
        window.scroll(0, 0);
        resolve();
      }
    }, 100);
  });

  try {
    await Promise.race([scrollingPromise, timeoutPromise]);
  } finally {
    // Whichever side lost the race must be stopped too: otherwise the page
    // keeps scrolling after a timeout, or the timeout timer lingers after a
    // successful scroll.
    clearInterval(scrollDown);
    clearTimeout(timeoutId);
  }
};
|