Add --source & :HTML-URL to resolve relative URLs

Now, relative URLs in the input file can be
mirrored in relation to a given URL.
This commit is contained in:
Jaidyn Ann 2024-05-31 22:29:47 -05:00
parent f22dbba40d
commit 19e01c762d
4 changed files with 47 additions and 19 deletions

View File

@ -28,19 +28,25 @@
;;; Mirror-img ;;; Mirror-img
;;; ———————————————————————————————————————— ;;; ————————————————————————————————————————
;; pathname/input-stream pathname list → string ;; pathname/input-stream pathname string string list → string
(defun mirror-img (html-source download-dir &key (url-dir download-dir) (tags *default-tags*)) (defun mirror-img (html-source download-dir
&key html-url
(url-dir download-dir)
(tags *default-tags*))
"Attempt to mirror all remote HREF/SRC URLs of an HTML documents tags, "Attempt to mirror all remote HREF/SRC URLs of an HTML documents tags,
downloading them to DOWNLOAD-DIR. For each remote URL that could be downloading them to DOWNLOAD-DIR. For each remote URL that could be
successfuly mirrored, the remote URLs will be replaced with the local successfuly mirrored, the remote URLs will be replaced with the local
version. The URL used will will be BASE-DIR plus the filename, where version. The URL used will will be BASE-DIR plus the filename, where
BASE-DIR defaults to DOWNLOAD-DIR. BASE-DIR defaults to DOWNLOAD-DIR.
HTML-SOURCE, the document, can be either a pathname or an input stream. HTML-SOURCE, the document, can be either a pathname or an input stream.
If HTML-URL is provided, then relative URLs in the file will be interpreted
in relation to HTML-URL.
Returns a string of the modified pages HTML." Returns a string of the modified pages HTML."
(let* ((dom (let* ((dom
(lquery:$ (lquery:initialize html-source))) (lquery:$ (lquery:initialize html-source)))
(urlpathname-alist (urlpathname-alist
(mirror-linked-urls dom :tags tags :download-dir download-dir)) (mirror-linked-urls dom :tags tags :download-dir download-dir
:html-url html-url))
(urlrelative-url-alist (urlrelative-url-alist
(mapcar (mapcar
(lambda (pair) (lambda (pair)
@ -53,7 +59,7 @@ Returns a string of the modified pages HTML."
(aref (lquery:$ dom (serialize)) 0))) (aref (lquery:$ dom (serialize)) 0)))
;; lquery-dom list pathname → alist ;; lquery-dom list pathname → alist
(defun mirror-linked-urls (dom &key (tags *default-tags*) download-dir) (defun mirror-linked-urls (dom &key (tags *default-tags*) download-dir (html-url ""))
"Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery "Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery
DOM, downloading them to either the current-working directory or the DOWNLOAD-DIR. DOM, downloading them to either the current-working directory or the DOWNLOAD-DIR.
Returns an associative list containing pairs of remote-URLs to local pathnames. Returns an associative list containing pairs of remote-URLs to local pathnames.
@ -62,12 +68,16 @@ For example:
((https://downloaded.com/a/b.jpg . #pa/b.jpg))" ((https://downloaded.com/a/b.jpg . #pa/b.jpg))"
(remove-if (remove-if
#'not #'not
(mapcar (lambda (url) (mapcar (lambda (original-url)
(let ((download-path (let* ((url (if (not (quri:uri-scheme (quri:uri original-url)))
(ensure-directories-exist (quri:merge-uris original-url html-url) ; For relative URLs
(mirrored-pathname url :base-dir download-dir)))) original-url))
(download-path
(ensure-directories-exist
(mirrored-pathname url :base-dir download-dir))))
(if (http-fetch url download-path) (if (http-fetch url download-path)
(cons url download-path)))) (cons original-url download-path))))
(linked-urls dom :tags tags)))) (linked-urls dom :tags tags))))
;; lquery-dom alist list → nil ;; lquery-dom alist list → nil

View File

@ -41,7 +41,8 @@
(when-opt opts :help (help)) (when-opt opts :help (help))
(let* ((input-stream (choose-input-stream (car free))) (let* ((input-stream (choose-input-stream (car free)))
(url-base (or (getf opts :url-base) (getf opts :download-dir) "mirror/")) (url-base (or (getf opts :url-base) (getf opts :download-dir) "mirror/"))
(download-dir (or (getf opts :download-dir) url-base))) (download-dir (or (getf opts :download-dir) url-base))
(source-url (getf opts :source-url)))
(when (not input-stream) (when (not input-stream)
(error-print 1 "No HTML file provided. See --help for more information.")) (error-print 1 "No HTML file provided. See --help for more information."))
(format (format
@ -49,7 +50,8 @@
(mirror-img:mirror-img (mirror-img:mirror-img
input-stream input-stream
download-dir download-dir
:url-dir url-base)))) :url-dir url-base
:html-url source-url))))
(error (c) (error (c)
(error-print 99 nil c)))) (error-print 99 nil c))))
@ -59,7 +61,20 @@
:short #\h :long "help") :short #\h :long "help")
(:name :url-base (:name :url-base
:description "path to mirror directory used in URLs" :description "path to mirror directory used in URLs"
:short #\b :long "base") :short #\b :long "base"
:arg-parser (lambda (base) base))
(:name :source-url
:description "URL used to resolve & mirror relative URLs"
:short #\s :long "source"
:arg-parser
(lambda (source)
(handler-case
(let ((uri (quri:uri source)))
(if (not (quri:uri-scheme uri))
(error-print 6 "Source URL is not proper (e.g., “https://invalid.tld/”).")
uri))
(error (c)
(error-print 6 "Source URL is not proper (e.g., “https://invalid.tld/”)." c)))))
(:name :download-dir (:name :download-dir
:description "directory for all mirrored files" :description "directory for all mirrored files"
:short #\d :long "downloads" :short #\d :long "downloads"
@ -79,8 +94,8 @@
"Prints help message and dies." "Prints help message and dies."
(unix-opts:describe (unix-opts:describe
:prefix (format nil "~A~%~A" :prefix (format nil "~A~%~A"
"usage: mirror-img [-h] [-d DIR] [-b BASE] HTML_FILE" "usage: mirror-img [-h] [-d DIR] [-b BASE] [-s URL] HTML_FILE"
" mirror-img [-h] [-d DIR] [-b BASE]") " mirror-img [-h] [-d DIR] [-b BASE] [-s URL]")
:stream stream) :stream stream)
(unix-opts:exit exit-code)) (unix-opts:exit exit-code))

View File

@ -51,10 +51,13 @@ systems path."
(setq *mirror-img-result* (setq *mirror-img-result*
(let* ((html-path (relative-pathname "t/testing-website/index.html")) (let* ((html-path (relative-pathname "t/testing-website/index.html"))
(download-dir (relative-pathname "t/tmp/")) (download-dir (relative-pathname "t/tmp/"))
(html-url "http://localhost:4242")
(url-dir (pathname-utils:relative-pathname (url-dir (pathname-utils:relative-pathname
(relative-pathname "t/") (relative-pathname "t/")
download-dir))) download-dir)))
(mirror-img:mirror-img html-path download-dir :url-dir url-dir)))))) (mirror-img:mirror-img html-path download-dir
:url-dir url-dir
:html-url html-url))))))
(define-test mirror-img.files-mirrored (:tags '(mirror-img)) (define-test mirror-img.files-mirrored (:tags '(mirror-img))
(assert-equal (assert-equal
@ -90,13 +93,13 @@ systems path."
(define-test linked-urls (:tags '(dom)) (define-test linked-urls (:tags '(dom))
(assert-equal (assert-equal
'("http://localhost:4242/res/img/b/fireplace.jpg" '("/res/welcoming you.jpg"
"http://localhost:4242/res/img/b/fireplace.jpg"
"http://localhost:4242/res/img/b/Im trying hard to randomly name these directories/more_calming.jpg" "http://localhost:4242/res/img/b/Im trying hard to randomly name these directories/more_calming.jpg"
"http://localhost:4242/res/img/b/Im trying hard to randomly name these directories/ĉu ĉi tio sufiĉe hazardas?!/classy_fireplace.jpg" "http://localhost:4242/res/img/b/Im trying hard to randomly name these directories/ĉu ĉi tio sufiĉe hazardas?!/classy_fireplace.jpg"
"http://localhost:4242/res/img/level-2/café.jpg" "http://localhost:4242/res/img/level-2/café.jpg"
"http://localhost:4242/res/img/merry christmas!!! ^_^.jpg" "http://localhost:4242/res/img/merry christmas!!! ^_^.jpg"
"http://localhost:4242/res/style.css" "http://localhost:4242/res/style.css")
"http://localhost:4242/res/welcoming you.jpg")
(sort (mirror-img::linked-urls (sort (mirror-img::linked-urls
(lquery:$ (lquery:initialize (lquery:$ (lquery:initialize
(relative-pathname "t/testing-website/index.html")))) (relative-pathname "t/testing-website/index.html"))))

View File

@ -11,7 +11,7 @@
<h1>A sultry welcome to you</h1> <h1>A sultry welcome to you</h1>
<figure> <figure>
<img src="http://localhost:4242/res/welcoming you.jpg"> <img src="/res/welcoming you.jpg">
<figcaption>Welcome~ <figcaption>Welcome~
<a href="https://twitter.com/Distrbeachboy/status/1758538124785852426"> <a href="https://twitter.com/Distrbeachboy/status/1758538124785852426">
🔗 🔗