Compare commits

...

2 Enmetoj

Author SHA1 Message Date
Jaidyn Ann 57bad39463 Update README.md, mention new --source argument 2024-05-31 22:35:08 -05:00
Jaidyn Ann 19e01c762d Add --source & :HTML-URL to resolve relative URLs
Now, relative URLs in the input file can be
mirrored in relation to a given URL.
2024-05-31 22:29:47 -05:00
5 changed files with 71 additions and 26 deletions

View File

@ -6,11 +6,12 @@ file, mirroring its remote images, stylesheets, and other resources.
## Usage ## Usage
``` ```
usage: mirror-img [-h] [-d DIR] [-b BASE] HTML_FILE usage: mirror-img [-h] [-d DIR] [-b BASE] [-s URL] HTML_FILE
mirror-img [-h] [-d DIR] [-b BASE] mirror-img [-h] [-d DIR] [-b BASE] [-s URL]
Available options: Available options:
-h, --help print this help text. -h, --help print this help text.
-b, --base path to mirror directory used in URLs -b, --base ARG path to mirror directory used in URLs
-s, --source ARG URL used to resolve & mirror relative URLs
-d, --downloads ARG directory for all mirrored files -d, --downloads ARG directory for all mirrored files
``` ```
@ -21,12 +22,28 @@ In order to mirror a webpage, you can simply download it and pipe it into mirror
$ curl https://www.gnu.org/philosophy/philosophy.html | mirror-img > philosophy.html $ curl https://www.gnu.org/philosophy/philosophy.html | mirror-img > philosophy.html
``` ```
And now `philosophy.html` will be a fully-local HTML file, with no external resources! And now `philosophy.html` is a fully-local HTML file with no external resources!
All mirrored content will be found in the `mirror/` directory, and all links
have been adjusted accordingly. … at least, it *would* be. Notice how some resources, like the CSS, dont load.
This is because they are defined as *relative* links (e.g., “../style.css”
rather than “https://invalid.tld/style.css”). In order for these to be
mirrored as well, mirror-img needs to somehow know the source URL.
You can use the `--source` argument to provide the source URL, so
relatively-linked resources can be mirrored, too:
```
$ SOURCE_URL="https://www.gnu.org/philosophy/philosophy.html"
$ curl "$SOURCE_URL" | mirror-img --source "$SOURCE_URL" > philosophy.html
```
*Now* were done! All mirrored content will be found in the `mirror/`
directory, and all links have been adjusted accordingly.
---
If youd like to change the download directory, you can use the `--downloads` If youd like to change the download directory, you can use the `--downloads`
argument. To change the directory used in the output HTMLs URLs, you can argument. To change the directory used in the output-HTMLs URLs, you can
use `--base`. use `--base`.
For example, if youd like to mirror files into `/tmp/mirrors/` but have URLs For example, if youd like to mirror files into `/tmp/mirrors/` but have URLs

View File

@ -28,19 +28,25 @@
;;; Mirror-img ;;; Mirror-img
;;; ———————————————————————————————————————— ;;; ————————————————————————————————————————
;; pathname/input-stream pathname list → string ;; pathname/input-stream pathname string string list → string
(defun mirror-img (html-source download-dir &key (url-dir download-dir) (tags *default-tags*)) (defun mirror-img (html-source download-dir
&key html-url
(url-dir download-dir)
(tags *default-tags*))
"Attempt to mirror all remote HREF/SRC URLs of an HTML documents tags, "Attempt to mirror all remote HREF/SRC URLs of an HTML documents tags,
downloading them to DOWNLOAD-DIR. For each remote URL that could be downloading them to DOWNLOAD-DIR. For each remote URL that could be
successfuly mirrored, the remote URLs will be replaced with the local successfuly mirrored, the remote URLs will be replaced with the local
version. The URL used will will be BASE-DIR plus the filename, where version. The URL used will will be BASE-DIR plus the filename, where
BASE-DIR defaults to DOWNLOAD-DIR. BASE-DIR defaults to DOWNLOAD-DIR.
HTML-SOURCE, the document, can be either a pathname or an input stream. HTML-SOURCE, the document, can be either a pathname or an input stream.
If HTML-URL is provided, then relative URLs in the file will be interpreted
in relation to HTML-URL.
Returns a string of the modified pages HTML." Returns a string of the modified pages HTML."
(let* ((dom (let* ((dom
(lquery:$ (lquery:initialize html-source))) (lquery:$ (lquery:initialize html-source)))
(urlpathname-alist (urlpathname-alist
(mirror-linked-urls dom :tags tags :download-dir download-dir)) (mirror-linked-urls dom :tags tags :download-dir download-dir
:html-url html-url))
(urlrelative-url-alist (urlrelative-url-alist
(mapcar (mapcar
(lambda (pair) (lambda (pair)
@ -53,7 +59,7 @@ Returns a string of the modified pages HTML."
(aref (lquery:$ dom (serialize)) 0))) (aref (lquery:$ dom (serialize)) 0)))
;; lquery-dom list pathname → alist ;; lquery-dom list pathname → alist
(defun mirror-linked-urls (dom &key (tags *default-tags*) download-dir) (defun mirror-linked-urls (dom &key (tags *default-tags*) download-dir (html-url ""))
"Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery "Mirror all URLs in the HREF/SRC attributes of the given tags in an LQuery
DOM, downloading them to either the current-working directory or the DOWNLOAD-DIR. DOM, downloading them to either the current-working directory or the DOWNLOAD-DIR.
Returns an associative list containing pairs of remote-URLs to local pathnames. Returns an associative list containing pairs of remote-URLs to local pathnames.
@ -62,12 +68,16 @@ For example:
((https://downloaded.com/a/b.jpg . #pa/b.jpg))" ((https://downloaded.com/a/b.jpg . #pa/b.jpg))"
(remove-if (remove-if
#'not #'not
(mapcar (lambda (url) (mapcar (lambda (original-url)
(let ((download-path (let* ((url (if (not (quri:uri-scheme (quri:uri original-url)))
(ensure-directories-exist (quri:merge-uris original-url html-url) ; For relative URLs
(mirrored-pathname url :base-dir download-dir)))) original-url))
(download-path
(ensure-directories-exist
(mirrored-pathname url :base-dir download-dir))))
(if (http-fetch url download-path) (if (http-fetch url download-path)
(cons url download-path)))) (cons original-url download-path))))
(linked-urls dom :tags tags)))) (linked-urls dom :tags tags))))
;; lquery-dom alist list → nil ;; lquery-dom alist list → nil

View File

@ -41,7 +41,8 @@
(when-opt opts :help (help)) (when-opt opts :help (help))
(let* ((input-stream (choose-input-stream (car free))) (let* ((input-stream (choose-input-stream (car free)))
(url-base (or (getf opts :url-base) (getf opts :download-dir) "mirror/")) (url-base (or (getf opts :url-base) (getf opts :download-dir) "mirror/"))
(download-dir (or (getf opts :download-dir) url-base))) (download-dir (or (getf opts :download-dir) url-base))
(source-url (getf opts :source-url)))
(when (not input-stream) (when (not input-stream)
(error-print 1 "No HTML file provided. See --help for more information.")) (error-print 1 "No HTML file provided. See --help for more information."))
(format (format
@ -49,7 +50,8 @@
(mirror-img:mirror-img (mirror-img:mirror-img
input-stream input-stream
download-dir download-dir
:url-dir url-base)))) :url-dir url-base
:html-url source-url))))
(error (c) (error (c)
(error-print 99 nil c)))) (error-print 99 nil c))))
@ -59,7 +61,20 @@
:short #\h :long "help") :short #\h :long "help")
(:name :url-base (:name :url-base
:description "path to mirror directory used in URLs" :description "path to mirror directory used in URLs"
:short #\b :long "base") :short #\b :long "base"
:arg-parser (lambda (base) base))
(:name :source-url
:description "URL used to resolve & mirror relative URLs"
:short #\s :long "source"
:arg-parser
(lambda (source)
(handler-case
(let ((uri (quri:uri source)))
(if (not (quri:uri-scheme uri))
(error-print 6 "Source URL is not proper (e.g., “https://invalid.tld/”).")
uri))
(error (c)
(error-print 6 "Source URL is not proper (e.g., “https://invalid.tld/”)." c)))))
(:name :download-dir (:name :download-dir
:description "directory for all mirrored files" :description "directory for all mirrored files"
:short #\d :long "downloads" :short #\d :long "downloads"
@ -79,8 +94,8 @@
"Prints help message and dies." "Prints help message and dies."
(unix-opts:describe (unix-opts:describe
:prefix (format nil "~A~%~A" :prefix (format nil "~A~%~A"
"usage: mirror-img [-h] [-d DIR] [-b BASE] HTML_FILE" "usage: mirror-img [-h] [-d DIR] [-b BASE] [-s URL] HTML_FILE"
" mirror-img [-h] [-d DIR] [-b BASE]") " mirror-img [-h] [-d DIR] [-b BASE] [-s URL]")
:stream stream) :stream stream)
(unix-opts:exit exit-code)) (unix-opts:exit exit-code))

View File

@ -51,10 +51,13 @@ systems path."
(setq *mirror-img-result* (setq *mirror-img-result*
(let* ((html-path (relative-pathname "t/testing-website/index.html")) (let* ((html-path (relative-pathname "t/testing-website/index.html"))
(download-dir (relative-pathname "t/tmp/")) (download-dir (relative-pathname "t/tmp/"))
(html-url "http://localhost:4242")
(url-dir (pathname-utils:relative-pathname (url-dir (pathname-utils:relative-pathname
(relative-pathname "t/") (relative-pathname "t/")
download-dir))) download-dir)))
(mirror-img:mirror-img html-path download-dir :url-dir url-dir)))))) (mirror-img:mirror-img html-path download-dir
:url-dir url-dir
:html-url html-url))))))
(define-test mirror-img.files-mirrored (:tags '(mirror-img)) (define-test mirror-img.files-mirrored (:tags '(mirror-img))
(assert-equal (assert-equal
@ -90,13 +93,13 @@ systems path."
(define-test linked-urls (:tags '(dom)) (define-test linked-urls (:tags '(dom))
(assert-equal (assert-equal
'("http://localhost:4242/res/img/b/fireplace.jpg" '("/res/welcoming you.jpg"
"http://localhost:4242/res/img/b/fireplace.jpg"
"http://localhost:4242/res/img/b/Im trying hard to randomly name these directories/more_calming.jpg" "http://localhost:4242/res/img/b/Im trying hard to randomly name these directories/more_calming.jpg"
"http://localhost:4242/res/img/b/Im trying hard to randomly name these directories/ĉu ĉi tio sufiĉe hazardas?!/classy_fireplace.jpg" "http://localhost:4242/res/img/b/Im trying hard to randomly name these directories/ĉu ĉi tio sufiĉe hazardas?!/classy_fireplace.jpg"
"http://localhost:4242/res/img/level-2/café.jpg" "http://localhost:4242/res/img/level-2/café.jpg"
"http://localhost:4242/res/img/merry christmas!!! ^_^.jpg" "http://localhost:4242/res/img/merry christmas!!! ^_^.jpg"
"http://localhost:4242/res/style.css" "http://localhost:4242/res/style.css")
"http://localhost:4242/res/welcoming you.jpg")
(sort (mirror-img::linked-urls (sort (mirror-img::linked-urls
(lquery:$ (lquery:initialize (lquery:$ (lquery:initialize
(relative-pathname "t/testing-website/index.html")))) (relative-pathname "t/testing-website/index.html"))))

View File

@ -11,7 +11,7 @@
<h1>A sultry welcome to you</h1> <h1>A sultry welcome to you</h1>
<figure> <figure>
<img src="http://localhost:4242/res/welcoming you.jpg"> <img src="/res/welcoming you.jpg">
<figcaption>Welcome~ <figcaption>Welcome~
<a href="https://twitter.com/Distrbeachboy/status/1758538124785852426"> <a href="https://twitter.com/Distrbeachboy/status/1758538124785852426">
🔗 🔗