commit 8cbb3e338d65d341df1e71fc112eb4b0aefd090a Author: Jaidyn Ann <10477760+JadedCtrl@users.noreply.github.com> Date: Fri May 24 20:42:21 2024 -0500 Init diff --git a/mirror-img.lisp b/mirror-img.lisp new file mode 100644 index 0000000..68e3e7b --- /dev/null +++ b/mirror-img.lisp @@ -0,0 +1,157 @@ +;; Copyright © 2024 Jaidyn Ann +;; +;; This program is free software: you can redistribute it and/or +;; modify it under the terms of the GNU General Public License as +;; published by the Free Software Foundation, either version 3 of +;; the License, or (at your option) any later version. +;; +;; This program is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with this program. If not, see . + +(defvar *src-tags* '("audio" "embed" "img" "input" "script" "source" "track" "video")) +(defvar *href-tags* '("link")) + + + +;;; Mirror-img +;;; ———————————————————————————————————————— +(defun mirror-remote-urls (html-file base-dir &optional (tags '("img"))) + "Attempt to mirror all remote HREF/SRC URLs of an HTML file’s tags, +downloading them to base-dir. The remote URLs will be replaced with the +local version, where downloading was successful. +Returns a string of the modified page’s HTML." + (let* ((dom (lquery:$ (lquery:initialize html-file))) + (url→pathname-alist + (mirror-linked-urls dom (append *src-tags* *href-tags*) base-dir)) + (url→relative-url-alist + (mapcar (lambda (pair) (cons (car pair) (namestring (cdr pair)))) + url→pathname-alist))) + (substitute-urls dom + url→relative-url-alist + (append *src-tags* *href-tags*)) + (aref (lquery:$ dom (serialize)) 0))) + + +(defun mirror-linked-urls (dom &optional (tags '("link" "img" "script")) base-dir) + "Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery +DOM, downloading them to either the current-working directory or the base-dir. +Returns an associative list containing pairs of remote-URLs to local pathnames. +Any unsuccessfuly-mirrored URL is excluded from the alist. +For example: + ((“https://downloaded.com/a/b.jpg” . #p“a/b.jpg”))" + (remove-if + #'not + (mapcar (lambda (url) + (let ((download-path + (ensure-directories-exist + (mirrored-pathname url base-dir)))) + (if (http-fetch url download-path) + (cons url download-path)))) + (linked-urls dom tags)))) + + +(defun substitute-urls (dom substitution-alist &optional (tags '("img"))) + "Replace SRC or HREF attributes of certain tags in an LQuery DOM, based on a +substitution associative list. The substitution alist is made up of cons-cells +mapping replacee URLs to substition URLs. +For instance, with the alist cons-cell (“https://xwx.moe/b.jpg” . “a/b.jpg”), +any HREF/SRC values of “https://xwx.moe/b.jpg” will be replaced with “a/b.jpg”, +among the given tags." + (mapcar + (lambda (tag) + (lquery:$ dom tag + #'(lambda (nodes) + (loop for node across nodes do + (let ((substitution + (cdr (assoc (node-url node) + substitution-alist :test #'equal)))) + (if substitution + (set-node-url node substitution))))))) + tags) + dom) + + +(defun mirrored-pathname (url &optional base-dir) + "Given a URL, return the corresponding path we would download it to, were we +to mirror it." + (let* ((url (quri:uri url)) + (path (quri:uri-path url))) + (pathname + (format nil "~@[~A/~]~A/~A~@[.~A~]" + base-dir + (quri:uri-host url) + (pathname-name path) + (pathname-type path))))) + + + +;;; DOM-parsing +;;; ———————————————————————————————————————— +(defun linked-urls (dom &optional (tags '("link" "img" "script"))) + "Return a list of all URLs in the LQuery DOM contained in the given tags’ +HREF and SRC attributes." + (flet ((url-list-of-tag (tag) + (remove-if + #'not + (append + (coerce (lquery:$ dom tag (attr "src")) 'list) + (coerce (lquery:$ dom tag (attr "href")) 'list))))) + (loop for tag in tags + append (url-list-of-tag tag)))) + + +(defun node-url (node) + "Return the SRC or HREF attribute of an LQuery node, if such a thing exists." + (and node + (or (lquery-funcs:attr node "src") + (lquery-funcs:attr node "href")))) + + +(defun set-node-url (node url) + "Set the SRC or HREF attribute of an LQuery node (based on which attribute is +already set)." + (and node + (if (lquery-funcs:attr node "src") + (lquery-funcs:attr node "src" url)) + (if (lquery-funcs:attr node "href") + (lquery-funcs:attr node "href" url)))) + + + +;;; Util +;;; ———————————————————————————————————————— +(defun http-fetch (url path) + "Download a URL to a path; if successful, returns the pathname. Otherwise, NIL." + (handler-case + (and (dexador:fetch (url-encode-uri (quri:uri url)) path) + (pathname path)) + (file-exists () + path) + (error nil))) + + +(defun url-encode-uri (uri) + "URL-encode the path component of a URI. For example, +“https://invalid.tld/dad alive.jpg” → “https://invalid.tld/dad%20alive.jpg”" + (let ((uri (quri:uri uri))) + (format nil "~@[~A://~]~A~@[:~A~]~A" + (quri:uri-scheme uri) + (quri:uri-host uri) + (quri:uri-port uri) + (url-encode-path (quri:uri-path uri))))) + + +(defun url-encode-path (path) + "URL-encode a pathname’s components (directory and filenames). For example, +“/images!/dad alive.jpg” → “/images%21/dad%20alive.jpg”" + (let* ((split (split-sequence:split-sequence #\/ path)) + (path-parts (if (not (car split)) '("") split))) + (reduce + (lambda (a b) + (format nil "~A/~A" a b)) + (mapcar #'quri:url-encode path-parts))))