// Source: el.xwx.moe/lib/api/archiveHandler.ts (252 lines, 7.4 KiB, TypeScript)

2024-06-26 12:54:03 -05:00
import { LaunchOptions, Page, chromium, devices } from "playwright";
import { prisma } from "./db";
import createFile from "./storage/createFile";
2024-03-26 00:38:08 -05:00
import sendToWayback from "./preservationScheme/sendToWayback";
import { Collection, Link, User } from "@prisma/client";
import validateUrlSize from "./validateUrlSize";
import createFolder from "./storage/createFolder";
import generatePreview from "./generatePreview";
2024-04-08 18:35:06 -05:00
import { removeFiles } from "./manageLinkFiles";
2024-06-27 11:39:03 -05:00
import handleMonolith from "./preservationScheme/handleMonolith";
import handleReadablility from "./preservationScheme/handleReadablility";
import handleArchivePreview from "./preservationScheme/handleArchivePreview";
import handleScreenshotAndPdf from "./preservationScheme/handleScreenshotAndPdf";
/**
 * A Prisma `Link` record that also carries its `collection`, which in turn
 * carries the collection's `owner` user record.
 */
type LinksAndCollectionAndOwner = Link & {
  collection: Collection & { owner: User };
};
/**
 * Upper bound, in minutes, on how long a single archive run may take.
 * Read from the BROWSER_TIMEOUT environment variable; defaults to 5.
 */
const BROWSER_TIMEOUT = (() => {
  const configuredMinutes = Number(process.env.BROWSER_TIMEOUT);
  // NaN (unset or non-numeric value) and 0 are both falsy and fall back to 5.
  return configuredMinutes ? configuredMinutes : 5;
})();
/**
 * Preserves a single link.
 *
 * Launches Chromium through Playwright (optionally behind the proxy described
 * by the PROXY* environment variables), flags the link's archive fields as
 * "pending" in the database, and then either downloads the target directly
 * (image / PDF links) or loads the page and runs the preview, readability,
 * screenshot/PDF and single-file handlers. The whole run is raced against a
 * watchdog that rejects after BROWSER_TIMEOUT minutes.
 *
 * Whatever the outcome, every archive field whose stored value does not start
 * with "archives" is marked "unavailable" afterwards; if the link row no
 * longer exists, its files are removed instead. The browser is always closed
 * and the watchdog timer is always cancelled.
 *
 * @param link - The link to preserve, together with its collection and the
 *   collection's owner (whose archive preferences are consulted).
 * @throws Rethrows whatever the archive run or the watchdog rejected with,
 *   after logging it together with the link details.
 */
export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
  // Keep the timer handle so the watchdog can be cancelled on every exit path.
  // Left running, it would hold the event loop open for BROWSER_TIMEOUT
  // minutes and, if nothing ever awaited the promise (e.g. the browser failed
  // to launch), reject with no handler attached.
  let watchdog: ReturnType<typeof setTimeout> | undefined;
  const timeoutPromise = new Promise<never>((_, reject) => {
    watchdog = setTimeout(
      () =>
        reject(
          new Error(
            `Browser has been open for more than ${BROWSER_TIMEOUT} minutes.`
          )
        ),
      BROWSER_TIMEOUT * 60000
    );
  });

  try {
    // allow user to configure a proxy
    const browserOptions: LaunchOptions = {};
    if (process.env.PROXY) {
      browserOptions.proxy = {
        server: process.env.PROXY,
        bypass: process.env.PROXY_BYPASS,
        username: process.env.PROXY_USERNAME,
        password: process.env.PROXY_PASSWORD,
      };
    }

    const browser = await chromium.launch(browserOptions);
    try {
      const context = await browser.newContext({
        ...devices["Desktop Chrome"],
        ignoreHTTPSErrors: process.env.IGNORE_HTTPS_ERRORS === "true",
      });
      const page = await context.newPage();

      createFolder({
        filePath: `archives/preview/${link.collectionId}`,
      });
      createFolder({
        filePath: `archives/${link.collectionId}`,
      });

      try {
        await Promise.race([preserveLink(link, page), timeoutPromise]);
      } catch (err) {
        console.log(err);
        console.log("Failed Link details:", link);
        throw err;
      } finally {
        await settleLinkStatus(link);
      }
    } finally {
      // Runs even when context/page creation or the status update above
      // throws, so a Chromium process is never left behind.
      await browser.close();
    }
  } finally {
    clearTimeout(watchdog);
  }
}

/**
 * Performs the actual archiving of `link`, using `page` for URL links.
 * Detects the link type from the response content type, marks the archive
 * fields as pending, then dispatches to the matching handler(s).
 */
const preserveLink = async (link: LinksAndCollectionAndOwner, page: Page) => {
  const user = link.collection?.owner;

  const validatedUrl = link.url ? await validateUrlSize(link.url) : undefined;

  if (validatedUrl === null && process.env.IGNORE_URL_SIZE_LIMIT !== "true")
    // Thrown as a plain string on purpose: callers already receive this exact
    // value, so it is left unchanged.
    throw "Something went wrong while retrieving the file size.";

  const contentType = validatedUrl?.get("content-type");
  let linkType = "url";
  let imageExtension = "png";

  if (!link.url) linkType = link.type;
  else if (contentType?.includes("application/pdf")) linkType = "pdf";
  else if (contentType?.startsWith("image")) {
    linkType = "image";
    if (contentType.includes("image/jpeg")) imageExtension = "jpeg";
    else if (contentType.includes("image/png")) imageExtension = "png";
  }

  // Mark everything that is not archived yet as pending. Fields set to
  // `undefined` are expected to be left untouched by Prisma.
  await prisma.link.update({
    where: { id: link.id },
    data: {
      type: linkType,
      image:
        user.archiveAsScreenshot && !link.image?.startsWith("archive")
          ? "pending"
          : "unavailable",
      pdf:
        user.archiveAsPDF && !link.pdf?.startsWith("archive")
          ? "pending"
          : "unavailable",
      readable: !link.readable?.startsWith("archive") ? "pending" : undefined,
      singlefile: !link.singlefile?.startsWith("archive")
        ? "pending"
        : undefined,
      // Decided by the preview field itself (previously this looked at
      // `readable`, so an existing preview could be reset to "pending").
      preview: !link.preview?.startsWith("archive") ? "pending" : undefined,
      lastPreserved: new Date().toISOString(),
    },
  });

  // send to archive.org (started without awaiting; its result is not used here)
  if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);

  if (linkType === "image" && !link.image?.startsWith("archive")) {
    await imageHandler(link, imageExtension); // archive image (jpeg/png)
    return;
  } else if (linkType === "pdf" && !link.pdf?.startsWith("archive")) {
    await pdfHandler(link); // archive pdf
    return;
  } else if (link.url) {
    // archive url
    await page.goto(link.url, { waitUntil: "domcontentloaded" });
    const content = await page.content();

    // Preview
    if (
      !link.preview?.startsWith("archives") &&
      !link.preview?.startsWith("unavailable")
    )
      await handleArchivePreview(link, page);

    // Readability
    if (
      !link.readable?.startsWith("archives") &&
      !link.readable?.startsWith("unavailable")
    )
      await handleReadablility(content, link);

    // Screenshot/PDF
    if (
      (!link.image?.startsWith("archives") &&
        !link.image?.startsWith("unavailable")) ||
      (!link.pdf?.startsWith("archives") &&
        !link.pdf?.startsWith("unavailable"))
    )
      await handleScreenshotAndPdf(link, page, user);

    // SingleFile
    if (
      !link.singlefile?.startsWith("archive") &&
      !link.singlefile?.startsWith("unavailable") &&
      user.archiveAsSinglefile &&
      link.url
    )
      await handleMonolith(link, content);
  }
};

/**
 * Final bookkeeping after an archive run: re-reads the link and marks every
 * archive field that does not point at an "archives…" path as "unavailable".
 * If the link no longer exists, its files are removed instead.
 */
const settleLinkStatus = async (link: LinksAndCollectionAndOwner) => {
  const finalLink = await prisma.link.findUnique({
    where: { id: link.id },
  });

  if (!finalLink) {
    await removeFiles(link.id, link.collectionId);
    return;
  }

  await prisma.link.update({
    where: { id: link.id },
    data: {
      lastPreserved: new Date().toISOString(),
      readable: !finalLink.readable?.startsWith("archives")
        ? "unavailable"
        : undefined,
      image: !finalLink.image?.startsWith("archives")
        ? "unavailable"
        : undefined,
      singlefile: !finalLink.singlefile?.startsWith("archives")
        ? "unavailable"
        : undefined,
      pdf: !finalLink.pdf?.startsWith("archives") ? "unavailable" : undefined,
      preview: !finalLink.preview?.startsWith("archives")
        ? "unavailable"
        : undefined,
    },
  });
};
/**
 * Downloads the image behind the link's URL and, if the link still exists in
 * the database, writes it to `archives/<collectionId>/<id>.<extension>` and
 * records that path in the link's `image` field.
 *
 * @param link - The link whose `url` is fetched and whose `id` names the file.
 * @param extension - File extension (without the dot) for the stored image.
 */
const imageHandler = async (link: Link, extension: string) => {
  const { url, id } = link;

  const response = await fetch(url as string);
  const blob = await response.blob();
  const buffer = Buffer.from(await blob.arrayBuffer());

  // Look the link up again after the download; skip storing if it is gone.
  const storedLink = await prisma.link.findUnique({ where: { id } });
  if (!storedLink) return;

  const filePath = `archives/${storedLink.collectionId}/${id}.${extension}`;

  await createFile({ data: buffer, filePath });
  await prisma.link.update({
    where: { id },
    data: { image: filePath },
  });
};
/**
 * Downloads the PDF behind the link's URL and, if the link still exists in
 * the database, writes it to `archives/<collectionId>/<id>.pdf` and records
 * that path in the link's `pdf` field.
 *
 * @param link - The link whose `url` is fetched and whose `id` names the file.
 */
const pdfHandler = async (link: Link) => {
  const { url, id } = link;

  const response = await fetch(url as string);
  const blob = await response.blob();
  const buffer = Buffer.from(await blob.arrayBuffer());

  // Look the link up again after the download; skip storing if it is gone.
  const storedLink = await prisma.link.findUnique({ where: { id } });
  if (!storedLink) return;

  const filePath = `archives/${storedLink.collectionId}/${id}.pdf`;

  await createFile({ data: buffer, filePath });
  await prisma.link.update({
    where: { id },
    data: { pdf: filePath },
  });
};