add license
[gelbooru-dl.git] / gelbooru-dl
blob1e3df271ce79d59407309d50bc7e84bf0620b099
#!/bin/sh
# Global setup: daemon flag, job pipe path, signal handling, defaults.

# A leading -d selects "daemon" mode: jobs go through the shared pipe
# /tmp/gelbooru. Otherwise a pipe name private to this process is used.
if [ "${1-}" = "-d" ]; then
  shift
  is_daemon=true
  PIPE="/tmp/gelbooru"
else
  is_daemon=false
  PIPE="/tmp/gelbooru$$"
fi
trap 'exit' TERM
# On INT, signal every process in our process group (background workers too).
trap 'kill 0' INT

IMG_DIR=.          # where images are written (-o)
MDATA_DIR=.        # where metadata files are written/read (-m)
WRITE_MDATA=false  # set by -m and -j
WRITE_IMG=true     # cleared by -n
JSON_FILE=         # when non-empty, metadata is appended to this file (-j)

# curl with the script's retry policy.
# A function rather than an alias: aliases are not expanded by every shell
# when running a script, functions always are.
#@: extra curl arguments
ccurl() {
  curl --retry 100000 --retry-delay 4 "$@"
}
# Download one URL to a file. The data is first written to "<output>.part"
# and renamed only after wget succeeds, so the final name never holds a
# partial download. On failure a red message is printed and the shell exits 1.
#1: url
#2: output file
get() {
  if wget -q --show-progress -O "$2.part" "$1"; then
    mv -- "$2.part" "$2"
  else
    # printf instead of echo: echo's handling of \033 differs between shells.
    printf '\033[31mdownload failed: %s\033[0m\n' "$1" >&2
    exit 1
  fi
}
# Consume "url output-file" pairs from $PIPE and download them, running up to
# four "$0 --get <url> <file>" processes at a time. Returns when the pipe
# reaches end of file.
download_cluster() {
  # -r: run nothing when the input is empty; -n 2: one url/file pair per job.
  xargs -r -P 4 -n 2 "$0" --get < "$PIPE"
}
# Print the value of one "name: value" line of a metadata text.
# Lines that have no space after the colon (empty value) print nothing.
#1: attribute
#2: metadata
get_attr() {
  # printf, not echo: the metadata must be passed through verbatim.
  printf '%s\n' "$2" | grep "^${1}:" | cut -s -d' ' -f2-
}
# Print the tags of one class stored in a metadata text, i.e. the value of
# its "<class>_tags" attribute.
#1: tag class
#2: metadata
get_tags() {
  get_attr "${1}_tags" "$2"
}
# Print the post-list URL for a set of tags; tags are joined with '+'.
#@: tags
list_addr() {
  # printf, not echo: a leading tag such as "-n" must not be eaten as an
  # echo option.
  printf '%s%s\n' 'https://gelbooru.com/index.php?page=post&s=list&tags=' \
    "$(printf '%s\n' "$*" | sed 's/ /+/g')"
}
# Print the URL of one page of a post list. The "pid" query parameter is
# set to page * 42.
#1: list address
#2: page (defaults to 0 when omitted)
get_page() {
  printf '%s&pid=%s\n' "$1" "$(( ${2:-0} * 42 ))"
}
# Fetch a list page and print the address of every post linked from it,
# one per line. The address is the third double-quoted value following a
# class="thumb" marker, with "&amp;" decoded and "https:" prepended.
#1: query address
post_addrs() {
  ccurl -s "$1" |
    grep -E -o ' class="thumb">.*' |
    cut -s -d'"' -f6 |
    sed -e 's/&amp;/\&/g' -e 's/^/https:/'
}
# Print the numeric id of a post (the text after the last '=' of its
# address), zero-padded to ten digits.
#1: post address
post_id() {
  printf '%010u' "${1##*=}"
}
# Print the URL of the full-size image of a post: the first double-quoted
# value of the tag that wraps the "Original image" link text.
#1: post content
image_addr() {
  printf '%s\n' "$1" |
    grep -E -o '<[^<>]+>Original image</a></li>' |
    cut -s -d'"' -f2
}
# Print the tags of one class found in a post page, one per line.
# HTML entities are decoded and spaces inside a tag become '_'.
#1: tag type (general artist character copyright)
#2: post content
extract_tags() {
  # After splitting the match on '>', the tag text is the fifth field:
  # <li ...> <a ...> ?</a> <a ...> TAG
  # "&amp;" is decoded last so an escaped entity is not decoded twice.
  printf '%s\n' "$2" |
    grep -E -o '<li class="tag-type-'"$1"'"><a[^>]*>[^<>]*</a>[^<>]<a[^>]*>[^<>]+' |
    cut -s -d'>' -f5 |
    sed -e 's/&gt;/>/g' -e 's/&lt;/</g' -e "s/&#039;/'/g" \
      -e 's/&amp;/\&/g' -e 's/ /_/g'
}
# Print one "<class>_tags:" metadata line; the newline-separated tags in $2
# are joined with single spaces. Nothing follows the colon when $2 is empty.
#1: tag class
#2: newline-separated tags
_mdata_tag_line() {
  local joined
  joined="$(printf '%s' "$2" | tr '\n' ' ')"
  printf '%s\n' "${1}_tags:${joined:+ ${joined}}"
}

# Process one post: fetch its page, queue the image for download on file
# descriptor 3 (as "url output-file") and, when enabled, record its metadata
# either as one JSON object appended to $JSON_FILE or as a text file named
# after the post id in $MDATA_DIR.
# Reads IMG_DIR, MDATA_DIR, WRITE_IMG, WRITE_MDATA and JSON_FILE.
#1: post address
#2: selection directory (unused)
download_image() {
  local content image_url id filename output_file
  content="$(ccurl -s "$1")"
  image_url="$(image_addr "$content")"
  # Derive the id from our own argument, not from a variable of the caller.
  id="$(post_id "$1")"
  filename="${id}.${image_url##*.}"
  output_file="${IMG_DIR}/${filename}"

  # Never queue a job without a URL: the consumer reads the queue two words
  # at a time, so a missing word would misalign every following pair.
  if "$WRITE_IMG" && [ -n "$image_url" ] && [ ! -f "$output_file" ]; then
    mkdir -p "$IMG_DIR"
    printf '%s %s\n' "$image_url" "$output_file" >&3
  fi

  if "$WRITE_MDATA"; then
    local general_tags artist_tags character_tags copyright_tags metadata_tags
    general_tags="$(extract_tags general "$content")"
    artist_tags="$(extract_tags artist "$content")"
    character_tags="$(extract_tags character "$content")"
    copyright_tags="$(extract_tags copyright "$content")"
    metadata_tags="$(extract_tags metadata "$content")"
    if [ -n "$JSON_FILE" ]; then
      # Each object is followed by a comma and no newline; the code that
      # opens and closes the JSON array removes the last comma.
      printf '%s,' "$(jq -cn \
        --arg id "$id" --arg url "$image_url" \
        --arg gl "$general_tags" --arg at "$artist_tags" \
        --arg cr "$character_tags" --arg ct "$copyright_tags" \
        --arg ma "$metadata_tags" \
        '{
          id:$id, url:$url, tags:
          {
            general:$gl|split("\n"), artist:$at|split("\n"),
            character:$cr|split("\n"), copyright:$ct|split("\n"),
            metadata:$ma|split("\n")
          }
        }')" >> "$JSON_FILE"
    else
      local metadata_file="${MDATA_DIR}/${id}"
      mkdir -p "$MDATA_DIR"
      {
        printf 'url: %s\n' "$image_url"
        printf 'filename: %s\n' "$filename"
        # Joined explicitly instead of by unquoted expansion, so a tag such
        # as "*" is never expanded as a filename pattern.
        _mdata_tag_line general "$general_tags"
        _mdata_tag_line artist "$artist_tags"
        _mdata_tag_line character "$character_tags"
        _mdata_tag_line copyright "$copyright_tags"
        _mdata_tag_line metadata "$metadata_tags"
      } > "$metadata_file"
    fi
  fi
}
# Process every given post concurrently (one background subshell per post)
# and return once all of them have finished.
#@: post addresses
download_posts() {
  local post
  # Quoted "$@": post addresses contain '?', which must not be treated as a
  # filename pattern.
  for post in "$@"; do
    # DIR is not set anywhere in the visible script; it is passed through
    # as the (unused) second argument of download_image.
    (download_image "$post" "${DIR-}") &
  done
  wait
}
# Walk the result pages of a tag search and process every post found.
# Pages are fetched one after another until one yields no posts; the posts
# of each page are processed in the background. When $JSON_FILE is set, the
# file is (re)created as a JSON array around the objects written meanwhile.
#@: tags
download_list() {
  if [ -n "$JSON_FILE" ]; then
    printf '[' > "$JSON_FILE"
  fi
  local list p=0 posts='init'
  list="$(list_addr "$@")"
  while [ -n "$posts" ]; do
    posts="$(post_addrs "$(get_page "$list" "$p")")"
    if [ -n "$posts" ]; then
      # $posts is split into one word per address on purpose; set -f keeps
      # the '?' in each address from being expanded as a filename pattern.
      (set -f; download_posts $posts) &
    fi
    p="$((p + 1))"
  done
  wait
  if [ -n "$JSON_FILE" ]; then
    printf ']' >> "$JSON_FILE"
    # Drop only the comma left after the last object. Anchoring at the end
    # keeps a ",]" occurring inside the data untouched.
    sed -i 's/,]$/]/' "$JSON_FILE"
  fi
}
# Set difference: print, one per line, every element of the base set that
# does not occur in the removed set. Both sets are whitespace-separated.
# The body runs in a subshell so that set -f (elements must be split but
# never expanded as filename patterns) and the loop variables stay private.
#1: base set
#2: removed set
minus() (
  set -f
  for e in $1; do
    keep=true
    for r in $2; do
      if [ "$e" = "$r" ]; then
        keep=false
        break
      fi
    done
    if "$keep"; then
      printf '%s\n' "$e"
    fi
  done
)
# Print, one per line, the elements that match the extended regular
# expression from start to end.
#1: regex
#2-: set
filter() {
  local regex="$1"
  shift
  # Without elements printf would still emit one empty line.
  [ "$#" -gt 0 ] || return 0
  # printf, not echo: elements such as "-n" must reach grep unchanged.
  printf '%s\n' "$@" | grep -E -o -e "^${regex}\$"
}
# Print the name of every given file that lists the tag as a whole word on
# one of its "<class>_tags:" lines. The tag is compared as a plain string
# and the file name is always printed, even when only one file is given.
#1: tag
#2-: metadata files
_mdata_files_with_tag() {
  local tag="$1"
  shift
  # With no file operand awk would read standard input and block.
  [ "$#" -gt 0 ] || return 0
  GELBOORU_QUERY_TAG="$tag" awk '
    BEGIN { tag = ENVIRON["GELBOORU_QUERY_TAG"] }
    FNR == 1 { found = 0 }
    !found && /^[a-z]+_tags:/ {
      for (i = 2; i <= NF; i++) {
        # Concatenating "" forces a string comparison for numeric tags.
        if (($i "") == (tag "")) {
          print FILENAME
          found = 1
          break
        }
      }
    }
  ' "$@"
}

# Print, one per line, the metadata files in $MDATA_DIR that carry every
# plain tag and none of the tags given with a leading '-'.
# The body runs in a subshell so that set -f and the variables stay private.
#@: tags
query_metadata_files() (
  nl='
'
  files=
  for f in "$MDATA_DIR"/*; do
    [ -f "$f" ] && files="${files}${f}${nl}"
  done
  # From here on lists are split on whitespace but never expanded as
  # filename patterns.
  set -f
  for t in $(filter '^[^-].*$' "$@"); do
    files="$(_mdata_files_with_tag "$t" $files)"
  done
  for t in $(filter '^-.*$' "$@"); do
    files="$(minus "$files" "$(_mdata_files_with_tag "${t#-}" $files)")"
  done
  for f in $files; do
    printf '%s\n' "$f"
  done
)
# For each metadata file print the path of its image: $IMG_DIR joined with
# the value of the file's "filename:" line. Files without such a value are
# skipped instead of producing a bare "$IMG_DIR/".
#@: metadata files
get_data_files() {
  local f name
  for f in "$@"; do
    name="$(grep -- '^filename:' "$f" | cut -s -d ' ' -f 2)"
    if [ -n "$name" ]; then
      printf '%s\n' "${IMG_DIR}/${name}"
    fi
  done
}
# Print the image paths of all posts whose metadata matches the tag query.
# Runs in a subshell: the list of metadata files is split into words with
# filename expansion switched off.
#@: tags
query() (
  set -f
  get_data_files $(query_metadata_files "$@")
)
# Print every distinct tag of one class found in the given metadata files,
# sorted, one per line, as "<tag>:<class>".
#1: tag class
#2-: metadata files
list_tag() {
  local class="$1"
  shift
  # With no file operand sed would read standard input and block.
  [ "$#" -gt 0 ] || return 0
  # The tag list is taken from the line itself rather than from a field of
  # "file:line" output, so it works for a single file and for tags that
  # contain ':'.
  sed -n "s/^${class}_tags: //p" "$@" |
    tr ' ' '\n' |
    sort -u |
    sed -e '/^$/d' -e "s/\$/:${class}/"
}
# List the tags of all posts matching the tag query, sorted, one
# "<tag>:<class>" entry per line. Entries are colored by class: artist red,
# character blue, copyright yellow, metadata bright red, general uncolored.
# Runs in a subshell so that set -f and the variables stay private.
#@: tags
list_tags() (
  mfiles="$(query_metadata_files "$@")"
  # $mfiles is split into one word per file, without filename expansion.
  set -f
  {
    list_tag general $mfiles
    list_tag artist $mfiles
    list_tag character $mfiles
    list_tag copyright $mfiles
    list_tag metadata $mfiles
  } | sort | while IFS= read -r entry; do
    # printf, not echo: echo's handling of \033 differs between shells.
    case "$entry" in
      *:artist) printf '\033[31m%s\033[0m\n' "$entry" ;;
      *:character) printf '\033[34m%s\033[0m\n' "$entry" ;;
      *:copyright) printf '\033[33m%s\033[0m\n' "$entry" ;;
      *:metadata) printf '\033[91m%s\033[0m\n' "$entry" ;;
      *) printf '%s\n' "$entry" ;;
    esac
  done
)
# Queue, on file descriptor 3, a download ("url output-file") for every
# metadata file in $MDATA_DIR whose image is missing from $IMG_DIR.
# Files without a "filename:" or "url:" value are skipped.
# Note: inside this script the function takes precedence over sync(1).
sync() {
  local f metadata filename url
  for f in "$MDATA_DIR"/*; do
    [ -f "$f" ] || continue
    metadata="$(cat "$f")"
    filename="$(get_attr filename "$metadata")"
    [ -n "$filename" ] || continue
    [ ! -f "${IMG_DIR}/${filename}" ] || continue
    url="$(get_attr url "$metadata")"
    [ -n "$url" ] || continue
    # The download itself does not create directories.
    mkdir -p "$IMG_DIR"
    printf '%s %s\n' "$url" "${IMG_DIR}/${filename}" >&3
  done
}
# Print a short usage summary; placeholders are underlined.
usage() {
  # printf, not echo: echo's handling of \033 differs between shells.
  printf 'usage:\n'
  printf ' %s [-m \033[4mmetadata directory\033[0m] [-o \033[4moutput directory\033[0m] \033[4mtags...\033[0m\n' "$0"
  printf ' %s --get \033[4murl\033[0m \033[4moutput filename\033[0m\n' "$0"
}
# Run a command with file descriptor 3 opened for writing on $PIPE, then
# close the descriptor. The command runs in a subshell, so the descriptor
# is closed whatever the command does. Returns the command's exit status.
#@: command and its arguments
using_pipe() {
  local status
  exec 3> "$PIPE"
  ("$@")
  status=$?
  exec 3>&-
  return "$status"
}
# Run a job-producing command with a downloader attached to its job queue.
# In daemon mode, when the named pipe already exists, the command only
# writes into it. Otherwise the pipe is created, the command runs in the
# background, the downloader runs here until the queue is closed, and the
# pipe is removed when the shell exits.
#@: command and its arguments
using_download_cluster() {
  local status
  if "$is_daemon" && [ -p "$PIPE" ]; then
    (using_pipe "$@")
  else
    # Register the cleanup only for a pipe we created ourselves.
    mkfifo "$PIPE" || return 1
    trap 'rm -f "$PIPE"' EXIT
    (using_pipe "$@") &
    (download_cluster)
    status=$?
    # Reap the producer before returning.
    wait
    return "$status"
  fi
}
# Entry point.
# "--get <url> <file>" is the worker mode used by the downloader: fetch one
# file and exit. Anything else is parsed as options followed by tags (or by
# post addresses for -g).
if [ "$#" -eq 3 ] && [ "$1" = '--get' ]; then
  url="$2"
  file="$3"
  (get "$url" "$file")
else
  action=
  while getopts 'tsqho:m:nj:g' arg; do
    case "$arg" in
      o) IMG_DIR="$OPTARG" ;;
      m)
        MDATA_DIR="$OPTARG"
        WRITE_MDATA=true
        ;;
      n) WRITE_IMG=false ;;
      q) action=query ;;
      s) action=sync ;;
      t) action=tags ;;
      j)
        WRITE_MDATA=true
        JSON_FILE="$OPTARG"
        ;;
      g) action=get ;;
      h)
        usage
        exit 0
        ;;
      *)
        usage
        exit 1
        ;;
    esac
  done
  shift "$((OPTIND - 1))"
  # Each action runs in its own subshell. Arguments are passed on quoted so
  # they are not split or expanded as filename patterns a second time.
  case "$action" in
    query) (query "$@") ;;
    tags) (list_tags "$@") ;;
    sync) (using_download_cluster sync "$@") ;;
    get) (using_download_cluster download_posts "$@") ;;
    *) (using_download_cluster download_list "$@") ;;
  esac
fi