Add --source & :HTML-URL to resolve relative URLs

Now, relative URLs in the input file can be
mirrored in relation to a given URL.
This commit is contained in:
Jaidyn Ann 2024-05-31 22:29:47 -05:00
parent f22dbba40d
commit 19e01c762d
4 changed files with 47 additions and 19 deletions

View File

@ -28,19 +28,25 @@
;;; Mirror-img
;;; ————————————————————————————————————————
;; pathname/input-stream pathname list → string
(defun mirror-img (html-source download-dir &key (url-dir download-dir) (tags *default-tags*))
;; pathname/input-stream pathname string string list → string
(defun mirror-img (html-source download-dir
&key html-url
(url-dir download-dir)
(tags *default-tags*))
"Attempt to mirror all remote HREF/SRC URLs of an HTML documents tags,
downloading them to DOWNLOAD-DIR. For each remote URL that could be
successfuly mirrored, the remote URLs will be replaced with the local
version. The URL used will will be BASE-DIR plus the filename, where
BASE-DIR defaults to DOWNLOAD-DIR.
HTML-SOURCE, the document, can be either a pathname or an input stream.
If HTML-URL is provided, then relative URLs in the file will be interpreted
in relation to HTML-URL.
Returns a string of the modified pages HTML."
(let* ((dom
(lquery:$ (lquery:initialize html-source)))
(urlpathname-alist
(mirror-linked-urls dom :tags tags :download-dir download-dir))
(mirror-linked-urls dom :tags tags :download-dir download-dir
:html-url html-url))
(urlrelative-url-alist
(mapcar
(lambda (pair)
@ -53,7 +59,7 @@ Returns a string of the modified pages HTML."
(aref (lquery:$ dom (serialize)) 0)))
;; lquery-dom list pathname → alist
(defun mirror-linked-urls (dom &key (tags *default-tags*) download-dir)
(defun mirror-linked-urls (dom &key (tags *default-tags*) download-dir (html-url ""))
"Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery
DOM, downloading them to either the current-working directory or the DOWNLOAD-DIR.
Returns an associative list containing pairs of remote-URLs to local pathnames.
@ -62,12 +68,16 @@ For example:
((https://downloaded.com/a/b.jpg . #pa/b.jpg))"
(remove-if
#'not
(mapcar (lambda (url)
(let ((download-path
(ensure-directories-exist
(mirrored-pathname url :base-dir download-dir))))
(mapcar (lambda (original-url)
(let* ((url (if (not (quri:uri-scheme (quri:uri original-url)))
(quri:merge-uris original-url html-url) ; For relative URLs
original-url))
(download-path
(ensure-directories-exist
(mirrored-pathname url :base-dir download-dir))))
(if (http-fetch url download-path)
(cons url download-path))))
(cons original-url download-path))))
(linked-urls dom :tags tags))))
;; lquery-dom alist list → nil

View File

@ -41,7 +41,8 @@
(when-opt opts :help (help))
(let* ((input-stream (choose-input-stream (car free)))
(url-base (or (getf opts :url-base) (getf opts :download-dir) "mirror/"))
(download-dir (or (getf opts :download-dir) url-base)))
(download-dir (or (getf opts :download-dir) url-base))
(source-url (getf opts :source-url)))
(when (not input-stream)
(error-print 1 "No HTML file provided. See --help for more information."))
(format
@ -49,7 +50,8 @@
(mirror-img:mirror-img
input-stream
download-dir
:url-dir url-base))))
:url-dir url-base
:html-url source-url))))
(error (c)
(error-print 99 nil c))))
@ -59,7 +61,20 @@
:short #\h :long "help")
(:name :url-base
:description "path to mirror directory used in URLs"
:short #\b :long "base")
:short #\b :long "base"
:arg-parser (lambda (base) base))
(:name :source-url
:description "URL used to resolve & mirror relative URLs"
:short #\s :long "source"
:arg-parser
(lambda (source)
(handler-case
(let ((uri (quri:uri source)))
(if (not (quri:uri-scheme uri))
(error-print 6 "Source URL is not proper (e.g., “https://invalid.tld/”).")
uri))
(error (c)
(error-print 6 "Source URL is not proper (e.g., “https://invalid.tld/”)." c)))))
(:name :download-dir
:description "directory for all mirrored files"
:short #\d :long "downloads"
@ -79,8 +94,8 @@
"Prints help message and dies."
(unix-opts:describe
:prefix (format nil "~A~%~A"
"usage: mirror-img [-h] [-d DIR] [-b BASE] HTML_FILE"
" mirror-img [-h] [-d DIR] [-b BASE]")
"usage: mirror-img [-h] [-d DIR] [-b BASE] [-s URL] HTML_FILE"
" mirror-img [-h] [-d DIR] [-b BASE] [-s URL]")
:stream stream)
(unix-opts:exit exit-code))

View File

@ -51,10 +51,13 @@ systems path."
(setq *mirror-img-result*
(let* ((html-path (relative-pathname "t/testing-website/index.html"))
(download-dir (relative-pathname "t/tmp/"))
(html-url "http://localhost:4242")
(url-dir (pathname-utils:relative-pathname
(relative-pathname "t/")
download-dir)))
(mirror-img:mirror-img html-path download-dir :url-dir url-dir))))))
(mirror-img:mirror-img html-path download-dir
:url-dir url-dir
:html-url html-url))))))
(define-test mirror-img.files-mirrored (:tags '(mirror-img))
(assert-equal
@ -90,13 +93,13 @@ systems path."
(define-test linked-urls (:tags '(dom))
(assert-equal
'("http://localhost:4242/res/img/b/fireplace.jpg"
'("/res/welcoming you.jpg"
"http://localhost:4242/res/img/b/fireplace.jpg"
"http://localhost:4242/res/img/b/Im trying hard to randomly name these directories/more_calming.jpg"
"http://localhost:4242/res/img/b/Im trying hard to randomly name these directories/ĉu ĉi tio sufiĉe hazardas?!/classy_fireplace.jpg"
"http://localhost:4242/res/img/level-2/café.jpg"
"http://localhost:4242/res/img/merry christmas!!! ^_^.jpg"
"http://localhost:4242/res/style.css"
"http://localhost:4242/res/welcoming you.jpg")
"http://localhost:4242/res/style.css")
(sort (mirror-img::linked-urls
(lquery:$ (lquery:initialize
(relative-pathname "t/testing-website/index.html"))))

View File

@ -11,7 +11,7 @@
<h1>A sultry welcome to you</h1>
<figure>
<img src="http://localhost:4242/res/welcoming you.jpg">
<img src="/res/welcoming you.jpg">
<figcaption>Welcome~
<a href="https://twitter.com/Distrbeachboy/status/1758538124785852426">
🔗