mirror-img/mirror-img.lisp

168 lines
6.4 KiB
Common Lisp
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

;;;; mirror-img: Mirror an HTML pages remote images, CSS, etc.
;; Copyright © 2024 Jaidyn Ann <jadedctrl@posteo.at>
;;
;; This program is free software: you can redistribute it and/or
;; modify it under the terms of the GNU General Public License as
;; published by the Free Software Foundation, either version 3 of
;; the License, or (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
(defpackage :mirror-img
(:use :cl)
(:export :mirror-img))
(in-package :mirror-img)
(defvar *src-tags* '("audio" "embed" "img" "input" "script" "source" "track" "video"))
(defvar *href-tags* '("link"))
;;; Mirror-img
;;; ————————————————————————————————————————
;; pathname pathname list → string
(defun mirror-img (html-file base-dir &optional (tags '("img")))
"Attempt to mirror all remote HREF/SRC URLs of an HTML files tags,
downloading them to base-dir. The remote URLs will be replaced with the
local version, where downloading was successful.
Returns a string of the modified pages HTML."
(let* ((dom (lquery:$ (lquery:initialize html-file)))
(urlpathname-alist
(mirror-linked-urls dom (append *src-tags* *href-tags*) base-dir))
(urlrelative-url-alist
(mapcar (lambda (pair) (cons (car pair) (namestring (cdr pair))))
urlpathname-alist)))
(substitute-urls dom
urlrelative-url-alist
(append *src-tags* *href-tags*))
(aref (lquery:$ dom (serialize)) 0)))
;; lquery-dom list pathname → alist
(defun mirror-linked-urls (dom &optional (tags '("link" "img" "script")) base-dir)
"Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery
DOM, downloading them to either the current-working directory or the base-dir.
Returns an associative list containing pairs of remote-URLs to local pathnames.
Any unsuccessfuly-mirrored URL is excluded from the alist.
For example:
((“https://downloaded.com/a/b.jpg” . #p“a/b.jpg”))"
(remove-if
#'not
(mapcar (lambda (url)
(let ((download-path
(ensure-directories-exist
(mirrored-pathname url base-dir))))
(if (http-fetch url download-path)
(cons url download-path))))
(linked-urls dom tags))))
;; lquery-dom alist list → nil
(defun substitute-urls (dom substitution-alist &optional (tags '("img")))
"Replace SRC or HREF attributes of certain tags in an LQuery DOM, based on a
substitution associative list. The substitution alist is made up of cons-cells
mapping replacee URLs to substition URLs.
For instance, with the alist cons-cell (“https://xwx.moe/b.jpg” . “a/b.jpg”),
any HREF/SRC values of “https://xwx.moe/b.jpg” will be replaced with “a/b.jpg”,
among the given tags."
(mapcar
(lambda (tag)
(lquery:$ dom tag
#'(lambda (nodes)
(loop for node across nodes do
(let ((substitution
(cdr (assoc (node-url node)
substitution-alist :test #'equal))))
(if substitution
(set-node-url node substitution)))))))
tags)
dom)
;; string pathname → pathname
(defun mirrored-pathname (url &optional base-dir)
"Given a URL, return the corresponding path we would download it to, were we
to mirror it."
(let* ((url (quri:uri url))
(path (quri:uri-path url)))
(pathname
(format nil "~@[~A/~]~A/~A~@[.~A~]"
base-dir
(quri:uri-host url)
(pathname-name path)
(pathname-type path)))))
;;; DOM-parsing
;;; ————————————————————————————————————————
;; lquery-dom list → list
(defun linked-urls (dom &optional (tags '("link" "img" "script")))
"Return a list of all URLs in the LQuery DOM contained in the given tags
HREF and SRC attributes."
(flet ((url-list-of-tag (tag)
(remove-if
#'not
(append
(coerce (lquery:$ dom tag (attr "src")) 'list)
(coerce (lquery:$ dom tag (attr "href")) 'list)))))
(loop for tag in tags
append (url-list-of-tag tag))))
;; lquery-node → string
(defun node-url (node)
"Return the SRC or HREF attribute of an LQuery node, if such a thing exists."
(and node
(or (lquery-funcs:attr node "src")
(lquery-funcs:attr node "href"))))
;; lquery-node → nil
(defun set-node-url (node url)
"Set the SRC or HREF attribute of an LQuery node (based on which attribute is
already set)."
(and node
(if (lquery-funcs:attr node "src")
(lquery-funcs:attr node "src" url))
(if (lquery-funcs:attr node "href")
(lquery-funcs:attr node "href" url))))
;;; Util
;;; ————————————————————————————————————————
;; string pathname → pathname or nil
(defun http-fetch (url path)
"Download a URL to a path; if successful, returns the pathname. Otherwise, NIL."
(handler-case
(and (dexador:fetch (url-encode-uri (quri:uri url)) path)
(pathname path))
(file-exists ()
path)
(error nil)))
;; string → string
(defun url-encode-uri (uri)
"URL-encode the path component of a URI. For example,
“https://invalid.tld/dad alive.jpg” → “https://invalid.tld/dad%20alive.jpg”"
(let ((uri (quri:uri uri)))
(format nil "~@[~A://~]~A~@[:~A~]~A"
(quri:uri-scheme uri)
(quri:uri-host uri)
(quri:uri-port uri)
(url-encode-path (quri:uri-path uri)))))
;; string → string
(defun url-encode-path (path)
"URL-encode a pathnames components (directory and filenames). For example,
“/images!/dad alive.jpg” → “/images%21/dad%20alive.jpg”"
(let* ((split (split-sequence:split-sequence #\/ path))
(path-parts (if (not (car split)) '("") split)))
(reduce
(lambda (a b)
(format nil "~A/~A" a b))
(mapcar #'quri:url-encode path-parts))))