mirror-img/mirror-img.lisp

173 lines
6.3 KiB
Common Lisp
Raw Normal View History

2024-05-24 20:42:21 -05:00
;; Copyright © 2024 Jaidyn Ann <jadedctrl@posteo.at>
;;
;; This program is free software: you can redistribute it and/or
;; modify it under the terms of the GNU General Public License as
;; published by the Free Software Foundation, either version 3 of
;; the License, or (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
2024-05-24 20:58:37 -05:00
(defpackage :mirror-img
(:use :cl)
(:export :mirror-img))
(in-package :mirror-img)
2024-05-24 20:42:21 -05:00
(defvar *src-tags* '("audio" "embed" "img" "input" "script" "source" "track" "video"))
(defvar *href-tags* '("link"))
;;; Mirror-img
;;; ————————————————————————————————————————
;; pathname pathname list → string
2024-05-24 20:58:37 -05:00
(defun mirror-img (html-file base-dir &optional (tags '("img")))
2024-05-24 20:42:21 -05:00
"Attempt to mirror all remote HREF/SRC URLs of an HTML files tags,
downloading them to base-dir. The remote URLs will be replaced with the
local version, where downloading was successful.
Returns a string of the modified pages HTML."
(let* ((dom (lquery:$ (lquery:initialize html-file)))
(urlpathname-alist
(mirror-linked-urls dom (append *src-tags* *href-tags*) base-dir))
(urlrelative-url-alist
(mapcar (lambda (pair) (cons (car pair) (namestring (cdr pair))))
urlpathname-alist)))
(substitute-urls dom
urlrelative-url-alist
(append *src-tags* *href-tags*))
(aref (lquery:$ dom (serialize)) 0)))
;; lquery-dom list pathname → alist
2024-05-24 20:42:21 -05:00
(defun mirror-linked-urls (dom &optional (tags '("link" "img" "script")) base-dir)
"Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery
DOM, downloading them to either the current-working directory or the base-dir.
Returns an associative list containing pairs of remote-URLs to local pathnames.
Any unsuccessfuly-mirrored URL is excluded from the alist.
For example:
((https://downloaded.com/a/b.jpg . #pa/b.jpg))"
(remove-if
#'not
(mapcar (lambda (url)
(let ((download-path
(ensure-directories-exist
(mirrored-pathname url base-dir))))
(if (http-fetch url download-path)
(cons url download-path))))
(linked-urls dom tags))))
;; lquery-dom alist list → nil
2024-05-24 20:42:21 -05:00
(defun substitute-urls (dom substitution-alist &optional (tags '("img")))
"Replace SRC or HREF attributes of certain tags in an LQuery DOM, based on a
substitution associative list. The substitution alist is made up of cons-cells
mapping replacee URLs to substition URLs.
For instance, with the alist cons-cell (https://xwx.moe/b.jpg . a/b.jpg),
any HREF/SRC values of https://xwx.moe/b.jpg will be replaced with a/b.jpg,
among the given tags."
(mapcar
(lambda (tag)
(lquery:$ dom tag
#'(lambda (nodes)
(loop for node across nodes do
(let ((substitution
(cdr (assoc (node-url node)
substitution-alist :test #'equal))))
(if substitution
(set-node-url node substitution)))))))
tags)
dom)
;; string pathname → pathname
2024-05-24 20:42:21 -05:00
(defun mirrored-pathname (url &optional base-dir)
"Given a URL, return the corresponding path we would download it to, were we
to mirror it."
(let* ((url (quri:uri url))
(path (quri:uri-path url)))
(pathname
(format nil "~@[~A/~]~A/~A~@[.~A~]"
base-dir
(quri:uri-host url)
(pathname-name path)
(pathname-type path)))))
;;; DOM-parsing
;;; ————————————————————————————————————————
;; lquery-dom list → list
2024-05-24 20:42:21 -05:00
(defun linked-urls (dom &optional (tags '("link" "img" "script")))
"Return a list of all URLs in the LQuery DOM contained in the given tags
HREF and SRC attributes."
(flet ((url-list-of-tag (tag)
(remove-if
#'not
(append
(coerce (lquery:$ dom tag (attr "src")) 'list)
(coerce (lquery:$ dom tag (attr "href")) 'list)))))
(loop for tag in tags
append (url-list-of-tag tag))))
;; lquery-node → string
2024-05-24 20:42:21 -05:00
(defun node-url (node)
"Return the SRC or HREF attribute of an LQuery node, if such a thing exists."
(and node
(or (lquery-funcs:attr node "src")
(lquery-funcs:attr node "href"))))
;; lquery-node → nil
2024-05-24 20:42:21 -05:00
(defun set-node-url (node url)
"Set the SRC or HREF attribute of an LQuery node (based on which attribute is
already set)."
(and node
(if (lquery-funcs:attr node "src")
(lquery-funcs:attr node "src" url))
(if (lquery-funcs:attr node "href")
(lquery-funcs:attr node "href" url))))
;;; Util
;;; ————————————————————————————————————————
;; string pathname → pathname or nil
2024-05-24 20:42:21 -05:00
(defun http-fetch (url path)
"Download a URL to a path; if successful, returns the pathname. Otherwise, NIL."
(handler-case
(and (dexador:fetch (url-encode-uri (quri:uri url)) path)
(pathname path))
(file-exists ()
path)
(error nil)))
;; string → string
2024-05-24 20:42:21 -05:00
(defun url-encode-uri (uri)
"URL-encode the path component of a URI. For example,
https://invalid.tld/dad alive.jpg https://invalid.tld/dad%20alive.jpg"
(let ((uri (quri:uri uri)))
(format nil "~@[~A://~]~A~@[:~A~]~A"
(quri:uri-scheme uri)
(quri:uri-host uri)
(quri:uri-port uri)
(url-encode-path (quri:uri-path uri)))))
;; string → string
2024-05-24 20:42:21 -05:00
(defun url-encode-path (path)
"URL-encode a pathnames components (directory and filenames). For example,
/images!/dad alive.jpg /images%21/dad%20alive.jpg"
(let* ((split (split-sequence:split-sequence #\/ path))
(path-parts (if (not (car split)) '("") split)))
(reduce
(lambda (a b)
(format nil "~A/~A" a b))
(mapcar #'quri:url-encode path-parts))))