code improvements

This commit is contained in:
daniel31x13 2024-03-26 01:38:08 -04:00
parent 797ddc4b73
commit 2b8f7d4be2
5 changed files with 401 additions and 280 deletions

View File

@ -18,6 +18,7 @@ import {
import PreservedFormatRow from "@/components/PreserverdFormatRow";
import useAccountStore from "@/store/account";
import getPublicUserData from "@/lib/client/getPublicUserData";
import { BeatLoader } from "react-spinners";
type Props = {
onClose: Function;
@ -87,6 +88,15 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
);
};
const atLeastOneFormatAvailable = () => {
return (
screenshotAvailable(link) ||
pdfAvailable(link) ||
readabilityAvailable(link) ||
singlefileAvailable(link)
);
};
useEffect(() => {
(async () => {
const data = await getLink(link.id as number, isPublic);
@ -143,11 +153,10 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
<div className="divider mb-2 mt-1"></div>
{isReady() &&
(screenshotAvailable(link) ||
{screenshotAvailable(link) ||
pdfAvailable(link) ||
readabilityAvailable(link) ||
singlefileAvailable(link)) ? (
singlefileAvailable(link) ? (
<p className="mb-3">
The following formats are available for this link:
</p>
@ -156,8 +165,6 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
)}
<div className={`flex flex-col gap-3`}>
{isReady() ? (
<>
{screenshotAvailable(link) ? (
<PreservedFormatRow
name={"Screenshot"}
@ -182,6 +189,16 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
/>
) : undefined}
{singlefileAvailable(link) ? (
<PreservedFormatRow
name={"SingleFile (Full Copy)"}
icon={"bi-filetype-html"}
format={ArchivedFormat.singlefile}
activeLink={link}
downloadable={true}
/>
) : undefined}
{readabilityAvailable(link) ? (
<PreservedFormatRow
name={"Readable"}
@ -191,21 +208,14 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
/>
) : undefined}
{singlefileAvailable(link) ? (
<PreservedFormatRow
name={"Singlefile"}
icon={"bi-filetype-html"}
format={ArchivedFormat.singlefile}
activeLink={link}
downloadable={true}
{!isReady() && !atLeastOneFormatAvailable() ? (
<div className={`w-full h-full flex flex-col justify-center p-10`}>
<BeatLoader
color="oklch(var(--p))"
className="mx-auto mb-3"
size={30}
/>
) : undefined}
</>
) : (
<div
className={`w-full h-full flex flex-col justify-center p-10 skeleton bg-base-200`}
>
<i className="bi-stack drop-shadow text-primary text-8xl mx-auto mb-5"></i>
<p className="text-center text-2xl">
Link preservation is in the queue
</p>
@ -213,7 +223,22 @@ export default function PreservedFormatsModal({ onClose, activeLink }: Props) {
Please check back later to see the result
</p>
</div>
)}
) : !isReady() && atLeastOneFormatAvailable() ? (
<div className={`w-full h-full flex flex-col justify-center p-5`}>
<BeatLoader
color="oklch(var(--p))"
className="mx-auto mb-3"
size={20}
/>
<p className="text-center">
There are more preserved formats in the queue
</p>
<p className="text-center text-sm">
Please check back later to see the result
</p>
</div>
) : undefined}
<div
className={`flex flex-col sm:flex-row gap-3 items-center justify-center ${

View File

@ -1,18 +1,14 @@
import { LaunchOptions, chromium, devices } from "playwright";
import { prisma } from "./db";
import createFile from "./storage/createFile";
import sendToWayback from "./sendToWayback";
import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
import DOMPurify from "dompurify";
import sendToWayback from "./preservationScheme/sendToWayback";
import { Collection, Link, User } from "@prisma/client";
import validateUrlSize from "./validateUrlSize";
import removeFile from "./storage/removeFile";
import Jimp from "jimp";
import { execSync } from "child_process";
import axios from "axios";
import { Agent } from "http";
import createFolder from "./storage/createFolder";
import archiveAsSinglefile from "./preservationScheme/archiveAsSinglefile";
import archiveAsReadability from "./preservationScheme/archiveAsReadablility";
type LinksAndCollectionAndOwner = Link & {
collection: Collection & {
@ -23,25 +19,6 @@ type LinksAndCollectionAndOwner = Link & {
const BROWSER_TIMEOUT = Number(process.env.BROWSER_TIMEOUT) || 5;
export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
// allow user to configure a proxy
let browserOptions: LaunchOptions = {};
if (process.env.PROXY) {
browserOptions.proxy = {
server: process.env.PROXY,
bypass: process.env.PROXY_BYPASS,
username: process.env.PROXY_USERNAME,
password: process.env.PROXY_PASSWORD,
};
}
const browser = await chromium.launch(browserOptions);
const context = await browser.newContext({
...devices["Desktop Chrome"],
ignoreHTTPSErrors: process.env.IGNORE_HTTPS_ERRORS === "true",
});
const page = await context.newPage();
const timeoutPromise = new Promise((_, reject) => {
setTimeout(
() =>
@ -54,9 +31,24 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
);
});
// allow user to configure a proxy
let browserOptions: LaunchOptions = {};
if (process.env.PROXY) {
browserOptions.proxy = {
server: process.env.PROXY,
bypass: process.env.PROXY_BYPASS,
username: process.env.PROXY_USERNAME,
password: process.env.PROXY_PASSWORD,
};
}
const browser = await chromium.launch(browserOptions);
try {
await Promise.race([
(async () => {
const user = link.collection?.owner;
const validatedUrl = link.url
? await validateUrlSize(link.url)
: undefined;
@ -76,12 +68,7 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
else if (contentType.includes("image/png")) imageExtension = "png";
}
const user = link.collection?.owner;
// send to archive.org
if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);
const targetLink = await prisma.link.update({
await prisma.link.update({
where: { id: link.id },
data: {
type: linkType,
@ -106,6 +93,18 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
},
});
// SingleFile
if (
!link.singlefile?.startsWith("archive") &&
!link.singlefile?.startsWith("unavailable") &&
user.archiveAsSinglefile &&
link.url
)
await archiveAsSinglefile(link);
// send to archive.org
if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);
if (linkType === "image" && !link.image?.startsWith("archive")) {
await imageHandler(link, imageExtension); // archive image (jpeg/png)
return;
@ -115,100 +114,34 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
} else if (link.url) {
// archive url
const context = await browser.newContext({
...devices["Desktop Chrome"],
ignoreHTTPSErrors: process.env.IGNORE_HTTPS_ERRORS === "true",
});
const page = await context.newPage();
await page.goto(link.url, { waitUntil: "domcontentloaded" });
const content = await page.content();
// Singlefile
if (
user.archiveAsSinglefile &&
!link.singlefile?.startsWith("archive")
) {
let command = process.env.SINGLEFILE_ARCHIVE_COMMAND;
let httpApi = process.env.SINGLEFILE_ARCHIVE_HTTP_API;
if (command) {
if (command.includes("{{URL}}")) {
try {
let html = execSync(command.replace("{{URL}}", link.url), {
timeout: 60000,
maxBuffer: 1024 * 1024 * 100,
});
await createFile({
data: html,
filePath: `archives/${targetLink.collectionId}/${link.id}.html`,
});
} catch (err) {
console.error(
"Error running SINGLEFILE_ARCHIVE_COMMAND:",
err
);
}
} else {
console.error(
"Invalid SINGLEFILE_ARCHIVE_COMMAND. Missing {{URL}}"
);
}
} else if (httpApi) {
try {
let html = await axios.post(
httpApi,
{ url: link.url },
{
headers: {
"Content-Type": "application/x-www-form-urlencoded",
},
httpAgent: new Agent({ keepAlive: false }),
}
);
await createFile({
data: html.data,
filePath: `archives/${targetLink.collectionId}/${link.id}.html`,
});
} catch (err) {
console.error(
"Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API:",
err
);
}
} else {
console.error(
"No SINGLEFILE_ARCHIVE_COMMAND or SINGLEFILE_ARCHIVE_HTTP_API defined."
);
}
}
// Readability
const window = new JSDOM("").window;
const purify = DOMPurify(window);
const cleanedUpContent = purify.sanitize(content);
const dom = new JSDOM(cleanedUpContent, { url: link.url || "" });
const article = new Readability(dom.window.document).parse();
const articleText = article?.textContent
.replace(/ +(?= )/g, "") // strip out multiple spaces
.replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks
if (
articleText &&
articleText !== "" &&
!link.readable?.startsWith("archive")
) {
await createFile({
data: JSON.stringify(article),
filePath: `archives/${targetLink.collectionId}/${link.id}_readability.json`,
});
await prisma.link.update({
where: { id: link.id },
data: {
readable: `archives/${targetLink.collectionId}/${link.id}_readability.json`,
textContent: articleText,
},
});
}
!link.readable?.startsWith("archives") &&
!link.readable?.startsWith("unavailable")
)
await archiveAsReadability(content, link);
// Preview
if (
!link.preview?.startsWith("archives") &&
!link.preview?.startsWith("unavailable")
) {
const ogImageUrl = await page.evaluate(() => {
const metaTag = document.querySelector('meta[property="og:image"]');
const metaTag = document.querySelector(
'meta[property="og:image"]'
);
return metaTag ? (metaTag as any).content : null;
});
@ -276,7 +209,14 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
});
});
}
}
if (
(!link.image?.startsWith("archives") &&
!link.image?.startsWith("unavailable")) ||
(!link.pdf?.startsWith("archives") &&
!link.pdf?.startsWith("unavailable"))
) {
// Screenshot/PDF
await page.evaluate(
autoScroll,
@ -341,6 +281,7 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
});
}
}
}
})(),
timeoutPromise,
]);

View File

@ -0,0 +1,44 @@
import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
import DOMPurify from "dompurify";
import { prisma } from "../db";
import createFile from "../storage/createFile";
import { Link } from "@prisma/client";
/**
 * Extracts the readable article from a page's HTML and stores it as the
 * link's Readability archive.
 *
 * The HTML is sanitized with DOMPurify, parsed with Mozilla Readability, and
 * the parsed article is written as JSON to
 * `archives/<collectionId>/<linkId>_readability.json`. The link row is then
 * updated with that path and the flattened article text.
 *
 * Nothing is written when the link already has a readable archive, when
 * Readability yields no text, or when the link's collection cannot be found.
 *
 * @param content - HTML of the page to extract the article from.
 * @param link - The link being archived; `id`, `url` and `readable` are read.
 */
const archiveAsReadablility = async (content: string, link: Link) => {
  // Skip the (comparatively expensive) DOM parsing when a readable copy is
  // already stored for this link.
  if (link.readable?.startsWith("archive")) return;

  const window = new JSDOM("").window;
  const purify = DOMPurify(window);
  const cleanedUpContent = purify.sanitize(content);
  const dom = new JSDOM(cleanedUpContent, { url: link.url || "" });
  const article = new Readability(dom.window.document).parse();

  const articleText = article?.textContent
    .replace(/ +(?= )/g, "") // strip out multiple spaces
    .replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks

  // Covers both "no article parsed" (undefined) and an empty string.
  if (!articleText) return;

  // Re-read the collection id from the database instead of trusting the
  // value carried on the `link` argument.
  const collectionId = (
    await prisma.link.findUnique({
      where: { id: link.id },
      select: { collectionId: true },
    })
  )?.collectionId;

  // Without this guard a missing row would produce the path
  // "archives/undefined/..." and persist it on the link.
  if (!collectionId) {
    console.error("Error archiving as Readability: Collection ID not found");
    return;
  }

  const filePath = `archives/${collectionId}/${link.id}_readability.json`;

  await createFile({
    data: JSON.stringify(article),
    filePath,
  });

  await prisma.link.update({
    where: { id: link.id },
    data: {
      readable: filePath,
      textContent: articleText,
    },
  });
};

export default archiveAsReadablility;

View File

@ -0,0 +1,111 @@
import { execSync } from "child_process";
import createFile from "../storage/createFile";
import axios from "axios";
import { Agent } from "http";
import { prisma } from "../db";
import { Link } from "@prisma/client";
/**
 * Percent-encodes the characters of a URL that could terminate a quoted shell
 * argument or trigger expansion inside one: ASCII control characters, space,
 * `"`, `'`, backtick, `$` and backslash.
 *
 * NOTE(security): this only protects templates that wrap `{{URL}}` in quotes.
 * URL delimiters such as `&` and `;` cannot be encoded without changing the
 * URL, so an unquoted placeholder in SINGLEFILE_ARCHIVE_COMMAND remains unsafe.
 */
const escapeUrlForShell = (url: string): string =>
  url.replace(
    /[\x00-\x20\x7f"'`$\\]/g,
    (char) =>
      "%" + char.charCodeAt(0).toString(16).toUpperCase().padStart(2, "0")
  );

/**
 * Writes a SingleFile HTML snapshot to
 * `archives/<collectionId>/<linkId>.html` and records that path on the link.
 *
 * Logs `<errorPrefix>: ...` and returns without writing when the snapshot is
 * empty or the link's collection cannot be found.
 */
const storeSinglefile = async (
  link: Link,
  html: string | Buffer,
  errorPrefix: string
): Promise<void> => {
  if (!html.length) {
    console.error(`${errorPrefix}: Empty buffer`);
    return;
  }

  const collectionId = (
    await prisma.link.findUnique({
      where: { id: link.id },
      select: { collectionId: true },
    })
  )?.collectionId;

  if (!collectionId) {
    console.error(`${errorPrefix}: Collection ID not found`);
    return;
  }

  const filePath = `archives/${collectionId}/${link.id}.html`;

  await createFile({ data: html, filePath });

  // Only mark the link as archived once the file has been written.
  await prisma.link.update({
    where: { id: link.id },
    data: { singlefile: filePath },
  });
};

/**
 * Archives a link as a single self-contained HTML file.
 *
 * Uses SINGLEFILE_ARCHIVE_COMMAND (a shell command template containing
 * `{{URL}}`) when it is set; otherwise posts the URL to
 * SINGLEFILE_ARCHIVE_HTTP_API when that is set. Does nothing when the link has
 * no URL or neither variable is configured. Failures are logged, not thrown.
 *
 * @param link - The link to archive; `id` and `url` are read.
 */
const archiveAsSinglefile = async (link: Link) => {
  if (!link.url) return;

  const command = process.env.SINGLEFILE_ARCHIVE_COMMAND;
  const httpApi = process.env.SINGLEFILE_ARCHIVE_HTTP_API;

  if (command) {
    if (!command.includes("{{URL}}")) {
      console.error("Invalid SINGLEFILE_ARCHIVE_COMMAND. Missing {{URL}}");
      return;
    }

    try {
      // split/join instead of String.replace: replace() would interpret
      // "$&", "$'" etc. inside the URL as replacement patterns. This also
      // fills in every occurrence of the placeholder, not just the first.
      const html = execSync(
        command.split("{{URL}}").join(escapeUrlForShell(link.url)),
        {
          timeout: 120000,
          maxBuffer: 1024 * 1024 * 30,
        }
      );

      await storeSinglefile(
        link,
        html,
        "Error running SINGLEFILE_ARCHIVE_COMMAND"
      );
    } catch (err) {
      console.error("Error running SINGLEFILE_ARCHIVE_COMMAND:", err);
    }
  } else if (httpApi) {
    try {
      const response = await axios.post(
        httpApi,
        { url: link.url },
        {
          headers: {
            "Content-Type": "application/x-www-form-urlencoded",
          },
          httpAgent: new Agent({ keepAlive: false }),
        }
      );

      await storeSinglefile(
        link,
        response.data,
        "Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API"
      );
    } catch (err) {
      console.error(
        "Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API:",
        err
      );
    }
  }
};

export default archiveAsSinglefile;