- {isReady() ? (
- <>
- {screenshotAvailable(link) ? (
-
+ ) : undefined}
- {readabilityAvailable(link) ? (
-
+ ) : undefined}
+
+ {!isReady() && !atLeastOneFormatAvailable() ? (
+
+
- {singlefileAvailable(link) ? (
-
- ) : undefined}
- >
- ) : (
-
-
Link preservation is in the queue
@@ -213,7 +223,22 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
Please check back later to see the result
- )}
+ ) : !isReady() && atLeastOneFormatAvailable() ? (
+
+
+
+
+ There are more preserved formats in the queue
+
+
+ Please check back later to see the result
+
+
+ ) : undefined}
{
setTimeout(
() =>
@@ -54,9 +31,24 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
);
});
+ // allow user to configure a proxy
+ let browserOptions: LaunchOptions = {};
+ if (process.env.PROXY) {
+ browserOptions.proxy = {
+ server: process.env.PROXY,
+ bypass: process.env.PROXY_BYPASS,
+ username: process.env.PROXY_USERNAME,
+ password: process.env.PROXY_PASSWORD,
+ };
+ }
+
+ const browser = await chromium.launch(browserOptions);
+
try {
await Promise.race([
(async () => {
+ const user = link.collection?.owner;
+
const validatedUrl = link.url
? await validateUrlSize(link.url)
: undefined;
@@ -76,12 +68,7 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
else if (contentType.includes("image/png")) imageExtension = "png";
}
- const user = link.collection?.owner;
-
- // send to archive.org
- if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);
-
- const targetLink = await prisma.link.update({
+ await prisma.link.update({
where: { id: link.id },
data: {
type: linkType,
@@ -106,6 +93,18 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
},
});
+ // SingleFile
+ if (
+ !link.singlefile?.startsWith("archive") &&
+ !link.singlefile?.startsWith("unavailable") &&
+ user.archiveAsSinglefile &&
+ link.url
+ )
+ await archiveAsSinglefile(link);
+
+ // send to archive.org
+ if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);
+
if (linkType === "image" && !link.image?.startsWith("archive")) {
await imageHandler(link, imageExtension); // archive image (jpeg/png)
return;
@@ -115,230 +114,172 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
} else if (link.url) {
// archive url
+ const context = await browser.newContext({
+ ...devices["Desktop Chrome"],
+ ignoreHTTPSErrors: process.env.IGNORE_HTTPS_ERRORS === "true",
+ });
+
+ const page = await context.newPage();
+
await page.goto(link.url, { waitUntil: "domcontentloaded" });
const content = await page.content();
- // Singlefile
- if (
- user.archiveAsSinglefile &&
- !link.singlefile?.startsWith("archive")
- ) {
- let command = process.env.SINGLEFILE_ARCHIVE_COMMAND;
- let httpApi = process.env.SINGLEFILE_ARCHIVE_HTTP_API;
- if (command) {
- if (command.includes("{{URL}}")) {
- try {
- let html = execSync(command.replace("{{URL}}", link.url), {
- timeout: 60000,
- maxBuffer: 1024 * 1024 * 100,
- });
- await createFile({
- data: html,
- filePath: `archives/${targetLink.collectionId}/${link.id}.html`,
- });
- } catch (err) {
- console.error(
- "Error running SINGLEFILE_ARCHIVE_COMMAND:",
- err
- );
- }
- } else {
- console.error(
- "Invalid SINGLEFILE_ARCHIVE_COMMAND. Missing {{URL}}"
- );
- }
- } else if (httpApi) {
- try {
- let html = await axios.post(
- httpApi,
- { url: link.url },
- {
- headers: {
- "Content-Type": "application/x-www-form-urlencoded",
- },
- httpAgent: new Agent({ keepAlive: false }),
- }
- );
- await createFile({
- data: html.data,
- filePath: `archives/${targetLink.collectionId}/${link.id}.html`,
- });
- } catch (err) {
- console.error(
- "Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API:",
- err
- );
- }
- } else {
- console.error(
- "No SINGLEFILE_ARCHIVE_COMMAND or SINGLEFILE_ARCHIVE_HTTP_API defined."
- );
- }
- }
-
// Readability
- const window = new JSDOM("").window;
- const purify = DOMPurify(window);
- const cleanedUpContent = purify.sanitize(content);
- const dom = new JSDOM(cleanedUpContent, { url: link.url || "" });
- const article = new Readability(dom.window.document).parse();
- const articleText = article?.textContent
- .replace(/ +(?= )/g, "") // strip out multiple spaces
- .replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks
if (
- articleText &&
- articleText !== "" &&
- !link.readable?.startsWith("archive")
- ) {
- await createFile({
- data: JSON.stringify(article),
- filePath: `archives/${targetLink.collectionId}/${link.id}_readability.json`,
- });
-
- await prisma.link.update({
- where: { id: link.id },
- data: {
- readable: `archives/${targetLink.collectionId}/${link.id}_readability.json`,
- textContent: articleText,
- },
- });
- }
+ !link.readable?.startsWith("archives") &&
+ !link.readable?.startsWith("unavailable")
+ )
+ await archiveAsReadability(content, link);
// Preview
- const ogImageUrl = await page.evaluate(() => {
- const metaTag = document.querySelector('meta[property="og:image"]');
- return metaTag ? (metaTag as any).content : null;
- });
+ if (
+ !link.preview?.startsWith("archives") &&
+ !link.preview?.startsWith("unavailable")
+ ) {
+ const ogImageUrl = await page.evaluate(() => {
+ const metaTag = document.querySelector(
+ 'meta[property="og:image"]'
+ );
+ return metaTag ? (metaTag as any).content : null;
+ });
- createFolder({
- filePath: `archives/preview/${link.collectionId}`,
- });
+ createFolder({
+ filePath: `archives/preview/${link.collectionId}`,
+ });
- if (ogImageUrl) {
- console.log("Found og:image URL:", ogImageUrl);
+ if (ogImageUrl) {
+ console.log("Found og:image URL:", ogImageUrl);
- // Download the image
- const imageResponse = await page.goto(ogImageUrl);
+ // Download the image
+ const imageResponse = await page.goto(ogImageUrl);
- // Check if imageResponse is not null
- if (imageResponse && !link.preview?.startsWith("archive")) {
- const buffer = await imageResponse.body();
+ // Check if imageResponse is not null
+ if (imageResponse && !link.preview?.startsWith("archive")) {
+ const buffer = await imageResponse.body();
- // Check if buffer is not null
- if (buffer) {
- // Load the image using Jimp
- Jimp.read(buffer, async (err, image) => {
- if (image && !err) {
- image?.resize(1280, Jimp.AUTO).quality(20);
- const processedBuffer = await image?.getBufferAsync(
- Jimp.MIME_JPEG
- );
+ // Check if buffer is not null
+ if (buffer) {
+ // Load the image using Jimp
+ Jimp.read(buffer, async (err, image) => {
+ if (image && !err) {
+ image?.resize(1280, Jimp.AUTO).quality(20);
+ const processedBuffer = await image?.getBufferAsync(
+ Jimp.MIME_JPEG
+ );
- createFile({
- data: processedBuffer,
- filePath: `archives/preview/${link.collectionId}/${link.id}.jpeg`,
- }).then(() => {
- return prisma.link.update({
- where: { id: link.id },
- data: {
- preview: `archives/preview/${link.collectionId}/${link.id}.jpeg`,
- },
+ createFile({
+ data: processedBuffer,
+ filePath: `archives/preview/${link.collectionId}/${link.id}.jpeg`,
+ }).then(() => {
+ return prisma.link.update({
+ where: { id: link.id },
+ data: {
+ preview: `archives/preview/${link.collectionId}/${link.id}.jpeg`,
+ },
+ });
});
- });
- }
- }).catch((err) => {
- console.error("Error processing the image:", err);
- });
- } else {
- console.log("No image data found.");
+ }
+ }).catch((err) => {
+ console.error("Error processing the image:", err);
+ });
+ } else {
+ console.log("No image data found.");
+ }
}
- }
- await page.goBack();
- } else if (!link.preview?.startsWith("archive")) {
- console.log("No og:image found");
- await page
- .screenshot({ type: "jpeg", quality: 20 })
- .then((screenshot) => {
- return createFile({
- data: screenshot,
- filePath: `archives/preview/${link.collectionId}/${link.id}.jpeg`,
- });
- })
- .then(() => {
- return prisma.link.update({
- where: { id: link.id },
- data: {
- preview: `archives/preview/${link.collectionId}/${link.id}.jpeg`,
- },
- });
- });
- }
-
- // Screenshot/PDF
- await page.evaluate(
- autoScroll,
- Number(process.env.AUTOSCROLL_TIMEOUT) || 30
- );
-
- // Check if the user hasn't deleted the link by the time we're done scrolling
- const linkExists = await prisma.link.findUnique({
- where: { id: link.id },
- });
- if (linkExists) {
- const processingPromises = [];
-
- if (
- user.archiveAsScreenshot &&
- !link.image?.startsWith("archive")
- ) {
- processingPromises.push(
- page.screenshot({ fullPage: true }).then((screenshot) => {
+ await page.goBack();
+ } else if (!link.preview?.startsWith("archive")) {
+ console.log("No og:image found");
+ await page
+ .screenshot({ type: "jpeg", quality: 20 })
+ .then((screenshot) => {
return createFile({
data: screenshot,
- filePath: `archives/${linkExists.collectionId}/${link.id}.png`,
+ filePath: `archives/preview/${link.collectionId}/${link.id}.jpeg`,
});
})
- );
+ .then(() => {
+ return prisma.link.update({
+ where: { id: link.id },
+ data: {
+ preview: `archives/preview/${link.collectionId}/${link.id}.jpeg`,
+ },
+ });
+ });
}
+ }
- // apply administrator's defined pdf margins or default to 15px
- const margins = {
- top: process.env.PDF_MARGIN_TOP || "15px",
- bottom: process.env.PDF_MARGIN_BOTTOM || "15px",
- };
+ if (
+ (!link.image?.startsWith("archives") &&
+ !link.image?.startsWith("unavailable")) ||
+ (!link.pdf?.startsWith("archives") &&
+ !link.pdf?.startsWith("unavailable"))
+ ) {
+ // Screenshot/PDF
+ await page.evaluate(
+ autoScroll,
+ Number(process.env.AUTOSCROLL_TIMEOUT) || 30
+ );
- if (user.archiveAsPDF && !link.pdf?.startsWith("archive")) {
- processingPromises.push(
- page
- .pdf({
- width: "1366px",
- height: "1931px",
- printBackground: true,
- margin: margins,
- })
- .then((pdf) => {
+ // Check if the user hasn't deleted the link by the time we're done scrolling
+ const linkExists = await prisma.link.findUnique({
+ where: { id: link.id },
+ });
+ if (linkExists) {
+ const processingPromises = [];
+
+ if (
+ user.archiveAsScreenshot &&
+ !link.image?.startsWith("archive")
+ ) {
+ processingPromises.push(
+ page.screenshot({ fullPage: true }).then((screenshot) => {
return createFile({
- data: pdf,
- filePath: `archives/${linkExists.collectionId}/${link.id}.pdf`,
+ data: screenshot,
+ filePath: `archives/${linkExists.collectionId}/${link.id}.png`,
});
})
- );
+ );
+ }
+
+ // apply administrator's defined pdf margins or default to 15px
+ const margins = {
+ top: process.env.PDF_MARGIN_TOP || "15px",
+ bottom: process.env.PDF_MARGIN_BOTTOM || "15px",
+ };
+
+ if (user.archiveAsPDF && !link.pdf?.startsWith("archive")) {
+ processingPromises.push(
+ page
+ .pdf({
+ width: "1366px",
+ height: "1931px",
+ printBackground: true,
+ margin: margins,
+ })
+ .then((pdf) => {
+ return createFile({
+ data: pdf,
+ filePath: `archives/${linkExists.collectionId}/${link.id}.pdf`,
+ });
+ })
+ );
+ }
+ await Promise.allSettled(processingPromises);
+ await prisma.link.update({
+ where: { id: link.id },
+ data: {
+ image: user.archiveAsScreenshot
+ ? `archives/${linkExists.collectionId}/${link.id}.png`
+ : undefined,
+ pdf: user.archiveAsPDF
+ ? `archives/${linkExists.collectionId}/${link.id}.pdf`
+ : undefined,
+ },
+ });
}
- await Promise.allSettled(processingPromises);
- await prisma.link.update({
- where: { id: link.id },
- data: {
- image: user.archiveAsScreenshot
- ? `archives/${linkExists.collectionId}/${link.id}.png`
- : undefined,
- pdf: user.archiveAsPDF
- ? `archives/${linkExists.collectionId}/${link.id}.pdf`
- : undefined,
- },
- });
}
}
})(),
diff --git a/lib/api/preservationScheme/archiveAsReadablility.ts b/lib/api/preservationScheme/archiveAsReadablility.ts
new file mode 100644
index 0000000..dec27da
--- /dev/null
+++ b/lib/api/preservationScheme/archiveAsReadablility.ts
@@ -0,0 +1,44 @@
+import { Readability } from "@mozilla/readability";
+import { JSDOM } from "jsdom";
+import DOMPurify from "dompurify";
+import { prisma } from "../db";
+import createFile from "../storage/createFile";
+import { Link } from "@prisma/client";
+
+/**
+ * Extracts a reader-mode version of a page and stores it for the given link.
+ *
+ * The HTML is sanitized with DOMPurify, parsed with Mozilla Readability, and
+ * the resulting article is written as JSON to
+ * `archives/<collectionId>/<linkId>_readability.json`. That path and the
+ * whitespace-flattened article text are then saved on the link row.
+ *
+ * Returns without writing anything when the link already has a readable
+ * archive, when no article text could be extracted, or when the link row can
+ * no longer be found.
+ *
+ * @param content - Raw HTML of the page to process.
+ * @param link - The link row the content belongs to.
+ */
+const archiveAsReadablility = async (content: string, link: Link) => {
+  // Already archived: skip the DOM parsing and sanitizing below.
+  if (link.readable?.startsWith("archive")) return;
+
+  const window = new JSDOM("").window;
+  const purify = DOMPurify(window);
+  const cleanedUpContent = purify.sanitize(content);
+  const dom = new JSDOM(cleanedUpContent, { url: link.url || "" });
+  const article = new Readability(dom.window.document).parse();
+  const articleText = article?.textContent
+    ?.replace(/ +(?= )/g, "") // strip out multiple spaces
+    .replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks
+
+  // Readability found no article (or only empty text): nothing to store.
+  if (!articleText) return;
+
+  // The collection id is read from the database instead of from `link`
+  // (presumably because the row may have changed since `link` was loaded).
+  const collectionId = (
+    await prisma.link.findUnique({
+      where: { id: link.id },
+      select: { collectionId: true },
+    })
+  )?.collectionId;
+
+  // The link was deleted in the meantime; without this guard the file would
+  // be written under "archives/undefined/" and the update below would throw.
+  if (!collectionId) {
+    console.error("Error archiving as Readability: Collection ID not found");
+    return;
+  }
+
+  const filePath = `archives/${collectionId}/${link.id}_readability.json`;
+
+  await createFile({
+    data: JSON.stringify(article),
+    filePath,
+  });
+
+  await prisma.link.update({
+    where: { id: link.id },
+    data: {
+      readable: filePath,
+      textContent: articleText,
+    },
+  });
+};
+
+export default archiveAsReadablility;
diff --git a/lib/api/preservationScheme/archiveAsSinglefile.ts b/lib/api/preservationScheme/archiveAsSinglefile.ts
new file mode 100644
index 0000000..0c739b8
--- /dev/null
+++ b/lib/api/preservationScheme/archiveAsSinglefile.ts
@@ -0,0 +1,111 @@
+import { execSync } from "child_process";
+import createFile from "../storage/createFile";
+import axios from "axios";
+import { Agent } from "http";
+import { prisma } from "../db";
+import { Link } from "@prisma/client";
+
+/**
+ * Writes a SingleFile HTML snapshot to storage and records its path on the link.
+ *
+ * Logs an error and returns without writing when the snapshot is empty or the
+ * link row can no longer be found.
+ *
+ * @param link - The link row the snapshot belongs to.
+ * @param html - Snapshot contents as produced by the command or the HTTP API.
+ * @param action - Description of the producing step, used in error messages.
+ */
+const storeSinglefile = async (
+  link: Link,
+  html: Buffer | string,
+  action: string
+) => {
+  if (!html?.length) {
+    console.error(`Error ${action}: Empty buffer`);
+    return;
+  }
+
+  // The collection id is read from the database instead of from `link`
+  // (presumably because the row may have changed since `link` was loaded).
+  const collectionId = (
+    await prisma.link.findUnique({
+      where: { id: link.id },
+      select: { collectionId: true },
+    })
+  )?.collectionId;
+
+  if (!collectionId) {
+    console.error(`Error ${action}: Collection ID not found`);
+    return;
+  }
+
+  const filePath = `archives/${collectionId}/${link.id}.html`;
+
+  await createFile({ data: html, filePath });
+  await prisma.link.update({
+    where: { id: link.id },
+    data: { singlefile: filePath },
+  });
+};
+
+/**
+ * Archives a link as a single self-contained HTML file.
+ *
+ * Uses `SINGLEFILE_ARCHIVE_COMMAND` (a shell command containing a `{{URL}}`
+ * placeholder) when it is set, otherwise `SINGLEFILE_ARCHIVE_HTTP_API` (an
+ * endpoint that receives the URL via POST). When neither is set, or the link
+ * has no URL, nothing happens. Failures are logged and never thrown.
+ *
+ * @param link - The link row to archive.
+ */
+const archiveAsSinglefile = async (link: Link) => {
+  if (!link.url) return;
+
+  // Captured so the narrowed string type is kept inside the callback below.
+  const url = link.url;
+  const command = process.env.SINGLEFILE_ARCHIVE_COMMAND;
+  const httpApi = process.env.SINGLEFILE_ARCHIVE_HTTP_API;
+
+  if (command) {
+    if (!command.includes("{{URL}}")) {
+      console.error("Invalid SINGLEFILE_ARCHIVE_COMMAND. Missing {{URL}}");
+      return;
+    }
+
+    try {
+      // NOTE(review): the URL is user-supplied and is interpolated into a shell
+      // command without escaping. Escaping here could break existing command
+      // templates that already quote {{URL}}, so this is flagged, not changed.
+      //
+      // A replacer function is used so that "$&", "$$" and similar sequences
+      // in the URL are inserted literally instead of being treated as
+      // replacement patterns.
+      const html = execSync(
+        command.replace("{{URL}}", () => url),
+        {
+          timeout: 120000,
+          maxBuffer: 1024 * 1024 * 30,
+        }
+      );
+
+      await storeSinglefile(link, html, "running SINGLEFILE_ARCHIVE_COMMAND");
+    } catch (err) {
+      console.error("Error running SINGLEFILE_ARCHIVE_COMMAND:", err);
+    }
+  } else if (httpApi) {
+    try {
+      const response = await axios.post(
+        httpApi,
+        { url },
+        {
+          headers: {
+            "Content-Type": "application/x-www-form-urlencoded",
+          },
+          httpAgent: new Agent({ keepAlive: false }),
+        }
+      );
+
+      await storeSinglefile(
+        link,
+        response.data,
+        "fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API"
+      );
+    } catch (err) {
+      console.error(
+        "Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API:",
+        err
+      );
+    }
+  }
+};
+
+export default archiveAsSinglefile;
diff --git a/lib/api/sendToWayback.ts b/lib/api/preservationScheme/sendToWayback.ts
similarity index 100%
rename from lib/api/sendToWayback.ts
rename to lib/api/preservationScheme/sendToWayback.ts