1 ;;; nnweb.el --- retrieving articles via web search engines
3 ;; Copyright (C) 1996-2012 Free Software Foundation, Inc.
5 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
25 ;; Note: You need to have `w3' installed for some functions to work.
29 (eval-when-compile (require 'cl
))
41 (autoload 'w3-parse-buffer
"w3-parse")
;; The stray line-number literal that sat between the init form and the
;; docstring has been removed so the string is the variable's documentation.
(defvoo nnweb-directory (nnheader-concat gnus-directory "nnweb/")
  "Where nnweb will save its files.")
;; The stray line-number literals (one before the docstring, one inside its
;; second line) have been removed.
(defvoo nnweb-type 'google
  "What search engine type is being used.
Valid types include `google', `dejanews', and `gmane'.")
;; Table of per-engine definitions.  Each visible entry pairs a key
;; (`id', `result', `article', `reference', `map', `search', `address',
;; `base', `identifier') with either a URL template string or a function
;; symbol.  `nnweb-definition' looks a key up in the entry selected by
;; `nnweb-type'.
;; NOTE(review): the integers at the start of the lines are line-number
;; residue from the listing, not code.  The numbering jumps 52->54 and
;; 72->74, and only the `dejanews' entry header is visible: the lines that
;; open the first entry and the entry holding the Gmane definitions appear
;; to be missing.  Restore them before relying on this form.
52 (defvar nnweb-type-definition
54 (id .
"http://www.google.com/groups?as_umsgid=%s&hl=en&dmode=source")
55 (result .
"http://groups.google.com/group/%s/msg/%s?dmode=source")
56 (article . nnweb-google-wash-article
)
57 (reference . identity
)
58 (map . nnweb-google-create-mapping
)
59 (search . nnweb-google-search
)
60 (address .
"http://groups.google.com/groups")
61 (base .
"http://groups.google.com")
62 (identifier . nnweb-google-identity
))
63 (dejanews ;; alias of google
64 (id .
"http://www.google.com/groups?as_umsgid=%s&hl=en&dmode=source")
65 (result .
"http://groups.google.com/group/%s/msg/%s?dmode=source")
66 (article . nnweb-google-wash-article
)
67 (reference . identity
)
68 (map . nnweb-google-create-mapping
)
69 (search . nnweb-google-search
)
70 (address .
"http://groups.google.com/groups")
71 (base .
"http://groups.google.com")
72 (identifier . nnweb-google-identity
))
74 (article . nnweb-gmane-wash-article
)
75 (id .
"http://gmane.org/view.php?group=%s")
76 (reference . identity
)
77 (map . nnweb-gmane-create-mapping
)
78 (search . nnweb-gmane-search
)
79 (address .
"http://search.gmane.org/nov.php")
80 (identifier . nnweb-gmane-identity
)))
81 "Type-definition alist.")
;; The stray line-number literal before the docstring has been removed.
;; The docstring no longer names Google only: the Gmane mapping function
;; passes this same variable to its own search function.
(defvoo nnweb-search nil
  "Search string to feed to the search engine selected by `nnweb-type'.")
;; The stray line-number literal before the docstring has been removed.
(defvoo nnweb-max-hits 999
  "Maximum number of hits to display.")
;; The stray line-number literal before the docstring has been removed.
(defvoo nnweb-ephemeral-p nil
  "Whether this nnweb server is ephemeral.")
92 ;;; Internal variables
;; Article list.  Entries are found by article number with `assq'; the
;; second element of an entry is that article's header.
(defvoo nnweb-articles nil)

;; Working buffer of the current server; the close functions kill it.
(defvoo nnweb-buffer nil)

;; Alist keyed by group name; the second element of an entry is used as
;; the group's active pair.
(defvoo nnweb-group-alist nil)

;; Group most recently passed to `nnweb-possibly-change-server'.
(defvoo nnweb-group nil)

;; Table created with `gnus-make-hashtable' and queried through
;; `nnweb-get-hashtb' using keys produced by `nnweb-identifier'.
(defvoo nnweb-hashtb nil)
100 ;;; Interface functions
;; NOTE(review): presumably this expands into the generic server functions
;; (e.g. `nnweb-server-opened' and `nnweb-open-server', which are called but
;; not defined in this file) -- confirm against nnoo.
(nnoo-define-basics nnweb)
;; For each number in ARTICLES that has a header recorded in
;; `nnweb-articles', call `nnheader-insert-nov' on that header with
;; `nntp-server-buffer' current.  Numbers without an entry are skipped.
;; FETCH-OLD is not referenced in the visible lines.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 107 and 113 and the form is not closed here, so whatever
;; prepares the buffer and the function's return value are not visible.
104 (deffoo nnweb-retrieve-headers
(articles &optional group server fetch-old
)
105 (nnweb-possibly-change-server group server
)
106 (with-current-buffer nntp-server-buffer
108 (let (article header
)
109 (mm-with-unibyte-current-buffer
110 (while (setq article
(pop articles
))
111 (when (setq header
(cadr (assq article nnweb-articles
)))
112 (nnheader-insert-nov header
))))
;; Refresh the article map for GROUP.  On an ephemeral server a new hash
;; table is created; otherwise the saved overview is read when
;; `nnweb-articles' is still empty.  The current type's `map' function is
;; then called, and on a non-ephemeral server the overview is written back.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 123, so one form inside the final `unless' is not visible.
115 (deffoo nnweb-request-scan
(&optional group server
)
116 (nnweb-possibly-change-server group server
)
117 (if nnweb-ephemeral-p
118 (setq nnweb-hashtb
(gnus-make-hashtable 4095))
119 (unless nnweb-articles
120 (nnweb-read-overview group
)))
121 (funcall (nnweb-definition 'map
))
122 (unless nnweb-ephemeral-p
124 (nnweb-write-overview group
)))
;; Select GROUP.  The visible lines read the group's overview unless the
;; server is ephemeral, report "No matching articles" when `nnweb-articles'
;; is empty, and otherwise fill the "211 %d %d %d %s" template with the
;; article count, the car and cdr of the active pair, and GROUP.  On an
;; ephemeral server the active pair is built from the numbers of the first
;; and last entries of `nnweb-articles'; otherwise it is taken from
;; `nnweb-group-alist'.  DONT-CHECK and INFO are not referenced in the
;; visible lines.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 129-130, 132, 135 and 141, so the conditional that
;; ties these pieces together and the call that consumes the "211"
;; template are not visible; the parentheses do not balance as listed.
126 (deffoo nnweb-request-group
(group &optional server dont-check info
)
127 (nnweb-possibly-change-server group server
)
128 (unless (or nnweb-ephemeral-p
131 (nnweb-read-overview group
))
133 ((not nnweb-articles
)
134 (nnheader-report 'nnweb
"No matching articles"))
136 (let ((active (if nnweb-ephemeral-p
137 (cons (caar nnweb-articles
)
138 (caar (last nnweb-articles
)))
139 (cadr (assoc group nnweb-group-alist
)))))
140 (nnheader-report 'nnweb
"Opened group %s" group
)
142 "211 %d %d %d %s\n" (length nnweb-articles
)
143 (car active
) (cdr active
) group
)))))
;; Discard the working buffer for GROUP: when `nnweb-buffer' is live it is
;; marked unmodified (so killing it does not prompt) and then killed.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering stops at 150 and the `deffoo' form is not closed here, so the
;; function's final form / return value is not visible.
145 (deffoo nnweb-close-group
(group &optional server
)
146 (nnweb-possibly-change-server group server
)
147 (when (gnus-buffer-live-p nnweb-buffer
)
148 (with-current-buffer nnweb-buffer
149 (set-buffer-modified-p nil
)
150 (kill-buffer nnweb-buffer
)))
;; Fetch ARTICLE into BUFFER (default `nntp-server-buffer').
;; The header recorded for ARTICLE in `nnweb-articles', if any, supplies the
;; URL through its xref field.  When ARTICLE is a string and the current
;; type defines `id', the surrounding angle brackets are stripped and the
;; `id' template is filled with the form-encoded result.  Unless
;; `nnheader-callback-function' is set, the type's `article' function is
;; then called.  The last visible form builds (GROUP . ARTICLE), with a nil
;; cdr when ARTICLE is not a number.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 158, 164, 171 and 173, so the conditional joining the
;; two fetch paths, the binding of `art', and the fetch inside the second
;; `mm-with-unibyte-current-buffer' are not visible.
153 (deffoo nnweb-request-article
(article &optional group server buffer
)
154 (nnweb-possibly-change-server group server
)
155 (with-current-buffer (or buffer nntp-server-buffer
)
156 (let* ((header (cadr (assq article nnweb-articles
)))
157 (url (and header
(mail-header-xref header
))))
159 (mm-with-unibyte-current-buffer
160 (mm-url-insert url
)))
161 (and (stringp article
)
162 (nnweb-definition 'id t
)
163 (let ((fetch (nnweb-definition 'id
))
165 (when (string-match "^<\\(.*\\)>$" article
)
166 (setq art
(match-string 1 article
)))
167 (when (and fetch art
)
168 (setq url
(format fetch
169 (mm-url-form-encode-xwfu art
)))
170 (mm-with-unibyte-current-buffer
172 (if (nnweb-definition 'reference t
)
174 (funcall (nnweb-definition
175 'reference
) article
)))))))
176 (unless nnheader-callback-function
177 (funcall (nnweb-definition 'article
)))
178 (nnheader-report 'nnweb
"Fetched article %s" article
)
179 (cons group
(and (numberp article
) article
))))))
(deffoo nnweb-close-server (&optional server)
  ;; Dispose of the working buffer first, but only when SERVER is open and
  ;; the buffer still exists.  Clearing the modified flag beforehand keeps
  ;; `kill-buffer' from asking for confirmation.
  (and (nnweb-server-opened server)
       (gnus-buffer-live-p nnweb-buffer)
       (with-current-buffer nnweb-buffer
         (set-buffer-modified-p nil)
         (kill-buffer nnweb-buffer)))
  ;; The value of the generic close routine is this function's value.
  (nnoo-close-server 'nnweb server))
;; With `nntp-server-buffer' current, call `nnmail-generate-active' on a
;; one-element list holding the `nnweb-group-alist' entry whose key is
;; SERVER.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering stops at 192 and the form is not closed here, so the
;; function's return value is not visible.
189 (deffoo nnweb-request-list
(&optional server
)
190 (nnweb-possibly-change-server nil server
)
191 (with-current-buffer nntp-server-buffer
192 (nnmail-generate-active (list (assoc server nnweb-group-alist
)))
(deffoo nnweb-request-update-info (group info &optional server)
  ;; Nothing to update for this back end: the arguments are ignored and
  ;; the function always returns nil.
  nil)
;; NOTE(review): only the name and the empty argument list are visible.
;; The numbering jumps from 197 to 200 and the form is not closed, so the
;; body (and hence the value this predicate returns) is missing from the
;; listing.  The leading integer is line-number residue.
197 (deffoo nnweb-asynchronous-p
()
;; (Re)create GROUP: any existing group of that name is first removed with
;; `nnweb-request-delete-group', then a fresh (GROUP (1 . 0)) entry is
;; pushed onto `nnweb-group-alist'.  ARGS is not referenced in the visible
;; lines.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering stops at 203 and the form is not closed here, so any
;; persistence step and the return value are not visible.
200 (deffoo nnweb-request-create-group
(group &optional server args
)
201 (nnweb-possibly-change-server nil server
)
202 (nnweb-request-delete-group group
)
203 (push `(,group
,(cons 1 0)) nnweb-group-alist
)
;; Remove GROUP: its entry is pulled from `nnweb-group-alist' with
;; `gnus-alist-pull' and its overview file is deleted.  FORCE is not
;; referenced in the visible lines.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 210 and 212 and the form is not closed here, so one
;; intermediate step and the return value are not visible.
207 (deffoo nnweb-request-delete-group
(group &optional force server
)
208 (nnweb-possibly-change-server group server
)
209 (gnus-alist-pull group nnweb-group-alist t
)
211 (gnus-delete-file (nnweb-overview-file group
))
;; NOTE(review): presumably this supplies default definitions for back end
;; interface functions this file does not define itself -- confirm
;; against nnoo.
(nnoo-define-skeleton nnweb)
216 ;;; Internal functions
;; When GROUP's overview file exists, its contents are inserted into a
;; temporary unibyte buffer and parsed with `nnheader-parse-nov'.  Each
;; parsed header yields a (NUMBER HEADER XREF) list, and the header is
;; registered through `nnweb-set-hashtb' together with the first entry of
;; `nnweb-articles'.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  The
;; numbering skips 224-225, 227 and 230, so the loop construct, the
;; binding of `header' and the place the list is pushed onto are not
;; visible.
218 (defun nnweb-read-overview (group)
219 "Read the overview of GROUP and build the map."
220 (when (file-exists-p (nnweb-overview-file group
))
221 (mm-with-unibyte-buffer
222 (nnheader-insert-file-contents (nnweb-overview-file group
))
223 (goto-char (point-min))
226 (setq header
(nnheader-parse-nov))
228 (push (list (mail-header-number header
)
229 header
(mail-header-xref header
))
231 (nnweb-set-hashtb header
(car nnweb-articles
)))))))
;; Write GROUP's overview file with `with-temp-file'.  A local copy of the
;; `nnweb-articles' list is consumed with `pop', and the header (second
;; element) of each popped entry is passed to `nnheader-insert-nov'.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  The
;; numbering skips 237, so the loop that drives the `pop' is not visible.
233 (defun nnweb-write-overview (group)
234 "Write the overview file for GROUP."
235 (with-temp-file (nnweb-overview-file group
)
236 (let ((articles nnweb-articles
))
238 (nnheader-insert-nov (cadr (pop articles
)))))))
;; Store an entry with `gnus-sethash' under the key that `nnweb-identifier'
;; derives from HEADER's xref field.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 242 and the form is not closed here, so the value and
;; table arguments (presumably DATA and `nnweb-hashtb' -- confirm) are not
;; visible.
240 (defun nnweb-set-hashtb (header data
)
241 (gnus-sethash (nnweb-identifier (mail-header-xref header
))
(defun nnweb-get-hashtb (url)
  "Return what `gnus-gethash' finds in `nnweb-hashtb' for URL.
The lookup key is URL after it has been passed through `nnweb-identifier'."
  (let ((key (nnweb-identifier url)))
    (gnus-gethash key nnweb-hashtb)))
(defun nnweb-identifier (ident)
  "Return IDENT as transformed by the current type's `identifier' function.
The function is obtained from `nnweb-definition'."
  (let ((identify (nnweb-definition 'identifier)))
    (funcall identify ident)))
(defun nnweb-overview-file (group)
  "Return the name of the overview file of GROUP.
The name is built by `nnheader-concat' from `nnweb-directory', GROUP
and the suffix \".overview\"."
  ;; The stray line-number literal that preceded the docstring has been
  ;; removed so the string is recognised as documentation again.
  (nnheader-concat nnweb-directory group ".overview"))
(defun nnweb-write-active ()
  "Save the active file.
The file is named \"active\" under `nnweb-directory' and contains a
single `setq' form that restores `nnweb-group-alist' when loaded.
Returns the form that was printed."
  (gnus-make-directory nnweb-directory)
  (with-temp-file (nnheader-concat nnweb-directory "active")
    ;; The printed form is read back with `load', so it must be complete:
    ;; neutralise any user setting that would abbreviate long or deeply
    ;; nested lists with "...".
    (let ((print-length nil)
          (print-level nil))
      (prin1 `(setq nnweb-group-alist ',nnweb-group-alist)
             (current-buffer)))))
(defun nnweb-read-active ()
  "Read the active file.
Loads the file named \"active\" under `nnweb-directory'.  `load' is
called with NOERROR, NOMESSAGE and NOSUFFIX all non-nil, so a missing
file is not an error, nothing is echoed, and the name is used exactly
as given.  Returns the value of `load'."
  ;; The stray line-number literal that preceded the docstring has been
  ;; removed so the string is recognised as documentation again.
  (let ((active-file (nnheader-concat nnweb-directory "active")))
    (load active-file t t t)))
;; Look TYPE up in the `nnweb-type-definition' entry selected by
;; `nnweb-type'; the cdr of the matching pair is bound to `def'.  The
;; visible lines also contain the "Undefined definition" error call.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  The
;; numbering skips 267-268 and 270, so how NOERROR gates the error and
;; what is returned are not visible; the form is not closed as listed.
264 (defun nnweb-definition (type &optional noerror
)
265 "Return the definition of TYPE."
266 (let ((def (cdr (assq type
(assq nnweb-type nnweb-type-definition
)))))
269 (error "Undefined definition %s" type
))
;; Make SERVER current: it is opened with `nnweb-open-server' when
;; `nnweb-server-opened' reports that it is not open.  The visible lines
;; also test `nnweb-group-alist', assign a table made by
;; (gnus-make-hashtable 4095) to `nnweb-hashtb', and record GROUP in
;; `nnweb-group'.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 273, 276, 278-279 and 281, so the body of the
;; `unless nnweb-group-alist' test and the conditions guarding the two
;; `setq' forms are not visible; the parentheses do not balance as listed.
272 (defun nnweb-possibly-change-server (&optional group server
)
274 (unless (nnweb-server-opened server
)
275 (nnweb-open-server server
))
277 (unless nnweb-group-alist
280 (setq nnweb-hashtb
(gnus-make-hashtable 4095)))
282 (setq nnweb-group group
)))
;; Set up the working buffer for SERVER unless `nnweb-buffer' is already
;; live.  The visible lines pass a name of the form
;; " *nnweb TYPE SEARCH SERVER*" to `nnheader-set-temp-buffer' and then
;; call `mm-disable-multibyte'.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  The
;; numbering skips 287-288 and stops at 292 with the form unclosed, so the
;; assignment to `nnweb-buffer' and the end of the function are not
;; visible.
284 (defun nnweb-init (server)
285 "Initialize buffers and such."
286 (unless (gnus-buffer-live-p nnweb-buffer
)
289 (nnheader-set-temp-buffer
290 (format " *nnweb %s %s %s*"
291 nnweb-type nnweb-search server
))
292 (mm-disable-multibyte)
296 ;;; groups.google.com
;; Reduce the fetched Google page in the current buffer to the article
;; text.  Searches are case-insensitive.  The visible lines look for the
;; "could not be found" notice or for a missing <pre>...</pre> pair and
;; emit "Requested article not found"; the other visible path deletes
;; everything up to the end of the opening <pre> match, deletes a region
;; beginning at the closing </pre> match, and decodes HTML entities.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 305, 307, 312, 314 and 320-321, so the conditional
;; that selects between the two paths and the end of the second
;; `delete-region' are not visible.
299 (defun nnweb-google-wash-article ()
300 ;; We have Google's masked e-mail addresses here. :-/
301 (let ((case-fold-search t
)
302 (start-re "<pre>[\r\n ]*")
303 (end-re "[\r\n ]*</pre>"))
304 (goto-char (point-min))
306 (or (re-search-forward "The requested message.*could not be found."
308 (not (and (re-search-forward start-re nil t
)
309 (re-search-forward end-re nil t
)))))
310 ;; FIXME: Don't know how to indicate "not found".
311 ;; Should this function throw an error? --rsteib
313 (gnus-message 3 "Requested article not found")
315 (delete-region (point-min)
316 (re-search-forward start-re
))
317 (goto-char (point-min))
318 (delete-region (progn
319 (re-search-forward end-re
)
322 (mm-url-decode-entities))))
;; Turn the Google search-result page in the current buffer into article
;; entries.  From the visible lines:
;;  - the group's active pair is read from `nnweb-group-alist', and a
;;    (GROUP (1 . 0)) entry is built where one is needed;
;;  - each thread link yields the newsgroup name and an ID, which are
;;    combined with the `result' template;
;;  - the subject is the link text with markup removed and entities decoded;
;;  - a following date/author line supplies Date and From, with
;;    `current-time-string' used to fill in missing date parts;
;;  - for URLs not yet known to `nnweb-get-hashtb', a header is built with
;;    `make-full-mail-header' and registered with `nnweb-set-hashtb'.
;; Message-ID, when given, takes precedence over the page's own ID.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  Many lines
;; are missing (e.g. 326-327, 330-331, 333, 336-337, 339, 344, 352, 355,
;; 359-360, 362, 365-367, 371-372, 374-376, 380, 382-383, 385-386),
;; including the `let' header, the search loops and the return value, so
;; the control flow above is a reading of fragments -- confirm against a
;; complete copy.
324 (defun nnweb-google-parse-1 (&optional Message-ID
)
325 "Parse search result in current buffer."
328 (active (cadr (assoc nnweb-group nnweb-group-alist
)))
329 Subject Score Date Newsgroups From
332 (push (list nnweb-group
(setq active
(cons 1 0)))
334 ;; Go through all the article hits on this page.
335 (goto-char (point-min))
338 "a +href=\"/group/\\([^>\"]+\\)/browse_thread/[^>]+#\\([0-9a-f]+\\)"
340 (setq Newsgroups
(match-string-no-properties 1)
341 ;; Note: Starting with Google Groups 2, `mid' is a Google-internal
342 ;; ID, not a proper Message-ID.
343 mid
(match-string-no-properties 2)
345 (nnweb-definition 'result
) Newsgroups mid
))
346 (narrow-to-region (search-forward ">" nil t
)
347 (search-forward "</a>" nil t
))
348 (mm-url-remove-markup)
349 (mm-url-decode-entities)
350 (setq Subject
(buffer-string))
351 (goto-char (point-max))
353 (narrow-to-region (point)
354 (search-forward "</table" nil t
))
356 (mm-url-remove-markup)
357 (mm-url-decode-entities)
358 (goto-char (point-max))
361 "^\\(?:\\(\\w+\\) \\([0-9]+\\)\\|\\S-+\\)\\(?: \\([0-9]\\{4\\}\\)\\)? by ?\\(.*\\)"
363 (setq Date
(if (match-string 1)
364 (format "%s %s 00:00:00 %s"
368 (substring (current-time-string) -
4)))
369 (current-time-string)))
370 (setq From
(match-string 4)))
373 (unless (nnweb-get-hashtb url
)
377 (make-full-mail-header
378 (cdr active
) (if Newsgroups
379 (concat "(" Newsgroups
") " Subject
)
381 From Date
(or Message-ID mid
)
384 (nnweb-set-hashtb (cadar map
) (car map
))))
;; Parse the current buffer with ID as the Message-ID, append the
;; resulting entries to `nnweb-articles', and, when the first entry has a
;; header, insert the contents of that header's xref URL.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 389 and stops at 393 with the form unclosed, so the
;; target of the `nconc' result and the end of the function are not
;; visible.
387 (defun nnweb-google-reference (id)
388 (let ((map (nnweb-google-parse-1 id
)) header
)
390 (nconc nnweb-articles map
))
391 (when (setq header
(cadar map
))
392 (mm-with-unibyte-current-buffer
393 (mm-url-insert (mail-header-xref header
)))
;; Run the current type's `search' function on `nnweb-search' inside
;; `nnweb-buffer' and collect the hits.  The visible lines append the
;; parsed page to `nnweb-articles', look for a "next" image link, stop when
;; there is none or the counter `i' has reached `nnweb-max-hits', and
;; otherwise fetch the next page from the `base' URL plus the link target.
;; The last visible form sorts `nnweb-articles' with `car-less-than-car'.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  The
;; numbering skips 399, 402-405, 409, 412, 414, 417-418 and 423, so the
;; `let' that binds `i' and `more', the loop construct and the buffer
;; clearing between pages are not visible.
396 (defun nnweb-google-create-mapping ()
397 "Perform the search and create a number-to-url alist."
398 (with-current-buffer nnweb-buffer
400 (nnheader-message 7 "Searching google...")
401 (when (funcall (nnweb-definition 'search
) nnweb-search
)
406 (nconc nnweb-articles
(nnweb-google-parse-1)))
407 ;; Check if there are more articles to fetch
408 (goto-char (point-min))
410 (if (or (not (re-search-forward
411 "<a [^>]+href=\"\n?\\([^>\" \n\t]+\\)[^<]*<img[^>]+src=[^>]+next"
413 (>= i nnweb-max-hits
))
415 ;; Yup, there are more articles
416 (setq more
(concat (nnweb-definition 'base
) (match-string 1)))
419 (nnheader-message 7 "Searching google...(%d)" i
)
420 (mm-url-insert more
))))
421 ;; Return the articles in the right order.
422 (nnheader-message 7 "Searching google...done")
424 (sort nnweb-articles
'car-less-than-car
))))))
;; Build a Google Groups query for SEARCH.  The visible lines use the
;; type's `address' definition and form-encode a parameter list whose
;; "num" entry is the smaller of 100 and `nnweb-max-hits'.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 427-428, 430, 432 and 435-442, so the fetch call, the
;; remaining query parameters (including where SEARCH is used) and the
;; return value are not visible; the form is not closed as listed.
426 (defun nnweb-google-search (search)
429 (nnweb-definition 'address
)
431 (mm-url-encode-www-form-urlencoded
433 ("num" .
,(number-to-string
434 (min 100 nnweb-max-hits
)))
;; Derive an identifier from URL.  The only visible logic tests URL for a
;; "selm=" query parameter and captures its value.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  The
;; numbering stops at 445 with the `if' unclosed, so both branches -- what
;; is returned on a match and on no match -- are not visible.
443 (defun nnweb-google-identity (url)
444 "Return an unique identifier based on URL."
445 (if (string-match "selm=\\([^ &>]+\\)" url
)
;; Run the current type's `search' function on `nnweb-search' inside
;; `nnweb-buffer' and turn the returned NOV lines into article entries.
;; For each non-empty line the visible code parses a header, and then:
;;  - rewrites an xref of the form " GROUP:NUM" (or "GROUP/NUM") to
;;    "http://article.gmane.org/GROUP/NUM/raw";
;;  - appends "public.gmane.org" to a From value that ends in "@";
;;  - re-encodes the subject with `rfc2047-encode-string';
;;  - when the xref is not yet known to `nnweb-get-hashtb', numbers the
;;    header from the incremented cdr of the active pair, pushes
;;    (NUMBER HEADER) onto `map' and registers it via `nnweb-set-hashtb'.
;; The last visible form appends `map' to `nnweb-articles' and sorts the
;; result with `car-less-than-car'.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  The
;; numbering skips 457-459, 464, 466, 475, 479, 484, 487, 492 and 494, so
;; the fallback for `active', the binding of `map', the line loop, the
;; header argument of `mail-header-set-xref' and the assignment of the
;; sorted list are not visible.
452 (defun nnweb-gmane-create-mapping ()
453 "Perform the search and create a number-to-url alist."
454 (with-current-buffer nnweb-buffer
455 (let ((case-fold-search t
)
456 (active (or (cadr (assoc nnweb-group nnweb-group-alist
))
460 (nnheader-message 7 "Searching Gmane..." )
461 (when (funcall (nnweb-definition 'search
) nnweb-search
)
462 (goto-char (point-min))
463 ;; Skip the status line
465 ;; Thanks to Olly Betts we now have NOV lines in our buffer!
467 (unless (or (eolp) (looking-at "\x0d"))
468 (let ((header (nnheader-parse-nov)))
469 (let ((xref (mail-header-xref header
))
470 (from (mail-header-from header
))
471 (subject (mail-header-subject header
))
472 (rfc2047-encoding-type 'mime
))
473 (when (string-match " \\([^:]+\\)[:/]\\([0-9]+\\)" xref
)
474 (mail-header-set-xref
476 (format "http://article.gmane.org/%s/%s/raw"
477 (match-string 1 xref
)
478 (match-string 2 xref
))))
480 ;; Add host part to gmane-encrypted addresses
481 (when (string-match "@$" from
)
482 (mail-header-set-from header
483 (concat from
"public.gmane.org")))
485 (mail-header-set-subject header
486 (rfc2047-encode-string subject
))
488 (unless (nnweb-get-hashtb (mail-header-xref header
))
489 (mail-header-set-number header
(incf (cdr active
)))
490 (push (list (mail-header-number header
) header
) map
)
491 (nnweb-set-hashtb (cadar map
) (car map
))))))
493 (nnheader-message 7 "Searching Gmane...done")
495 (sort (nconc nnweb-articles map
) 'car-less-than-car
)))))
;; Clean up a fetched Gmane page in the current buffer.  When the
;; "<!--X-Head-of-Message-->" marker is found, everything before it is
;; deleted, header list items of the form <li><em>NAME</em>...</li> at
;; point are rewritten, and the remaining markup is removed.
;; NOTE(review): the replacement text refers to group 2, but the pattern
;; on the preceding line has only one capture group -- confirm the
;; intended replacement.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 505, so the step that advances the `while' loop is not
;; visible.
497 (defun nnweb-gmane-wash-article ()
498 (let ((case-fold-search t
))
499 (goto-char (point-min))
500 (when (search-forward "<!--X-Head-of-Message-->" nil t
)
501 (delete-region (point-min) (point))
502 (goto-char (point-min))
503 (while (looking-at "^<li><em>\\([^ ]+\\)</em>.*</li>")
504 (replace-match "\\1\\2" t
)
506 (mm-url-remove-markup))))
;; Query Gmane for SEARCH.  The visible lines use the type's `address'
;; definition, form-encode a parameter list with "query" = SEARCH and
;; "HITSPERPAGE" = `nnweb-max-hits', clear `buffer-file-name', switch the
;; buffer to multibyte (outside XEmacs) and decode the whole buffer as
;; UTF-8.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 509-510, 512, 517 and 521, so the fetch call that
;; consumes the encoded query and the return value are not visible; the
;; form is not closed as listed.
508 (defun nnweb-gmane-search (search)
511 (nnweb-definition 'address
)
513 (mm-url-encode-www-form-urlencoded
514 `(("query" .
,search
)
515 ("HITSPERPAGE" .
,(number-to-string nnweb-max-hits
))
516 ;;("TOPDOC" . "1000")
518 (setq buffer-file-name nil
)
519 (unless (featurep 'xemacs
) (set-buffer-multibyte t
))
520 (mm-decode-coding-region (point-min) (point-max) 'utf-8
)
;; Derive an identifier from URL.  The only visible logic tests URL for a
;; "group=" parameter and captures everything after it.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  The
;; numbering stops at 525 with the `if' unclosed, so both branches -- what
;; is returned on a match and on no match -- are not visible.
523 (defun nnweb-gmane-identity (url)
524 "Return a unique identifier based on URL."
525 (if (string-match "group=\\(.+\\)" url
)
530 ;;; General web/w3 interface utility functions
;; Serialise a parse-tree node PARSE back into HTML text at point.  The
;; visible lines insert "<NAME " from the node's car, build "ATTR=" text
;; from attribute pairs, recurse over the node's third element with
;; `mapc', and finish with "</NAME>" and a newline.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  The
;; numbering skips 535, 540, 542-543, 545 and 547-551, so the handling of
;; non-node input (e.g. plain strings), the attribute loop and the
;; attribute value formatting are not visible.
533 (defun nnweb-insert-html (parse)
534 "Insert HTML based on a w3 parse tree."
536 ;; We used to call nnheader-string-as-multibyte here, but it cannot
537 ;; be right, so I removed it. If a bug shows up because of this change,
538 ;; please do not blindly revert the change, but help me find the real
539 ;; cause of the bug instead. --Stef
541 (insert "<" (symbol-name (car parse
)) " ")
544 (concat (symbol-name (car param
)) "="
546 (if (consp (cdr param
))
552 (mapc 'nnweb-insert-html
(nth 2 parse
))
553 (insert "</" (symbol-name (car parse
)) ">\n")))
;; Entry point for a search through PARSE: delegates to
;; `nnweb-parse-find-1' with TYPE, PARSE and the optional MAXDEPTH.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  The
;; numbering skips 557 and there is one closing parenthesis too many for
;; the visible forms; since `nnweb-parse-find-1' throws to the tag
;; `found', the missing line presumably opens the matching `catch' --
;; confirm.
555 (defun nnweb-parse-find (type parse
&optional maxdepth
)
556 "Find the element of TYPE in PARSE."
558 (nnweb-parse-find-1 type parse maxdepth
)))
(defun nnweb-parse-find-1 (type contents maxdepth)
  "Search CONTENTS depth-first for a cons whose car is `eq' to TYPE.
On the first hit, `throw' the matching cons to the tag `found'; the
caller must have established a matching `catch'.  MAXDEPTH, when
non-nil, is the number of nesting levels that may still be entered: a
value of zero stops the descent.  Returns nil when nothing is found."
  ;; The stray line-number literals have been removed: one of them sat
  ;; inside the depth test and made it always true, another was passed as
  ;; an extra argument to the recursive call.
  (unless (and maxdepth (zerop maxdepth))
    (when (consp contents)
      (if (eq type (car contents))
          (throw 'found contents))
      ;; Only walk CONTENTS when its tail is a list, not a dotted pair.
      (if (listp (cdr contents))
          (dolist (child contents)
            (if (consp child)
                (nnweb-parse-find-1 type child
                                    (and maxdepth (1- maxdepth)))))))))
;; Entry point for collecting matches in PARSE: delegates to
;; `nnweb-parse-find-all-1' with TYPE and PARSE.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  The
;; numbering skips 574 and there is one closing parenthesis too many for
;; the visible forms, so an enclosing form is missing from the listing.
572 (defun nnweb-parse-find-all (type parse
)
573 "Find all elements of TYPE in PARSE."
575 (nnweb-parse-find-all-1 type parse
)))
;; Recursive worker.  When CONTENTS is a cons whose car is `eq' to TYPE it
;; is pushed onto `result'; otherwise, when the tail of CONTENTS is a
;; list, each cons element is searched recursively and the outcome is
;; joined to `result' with `nconc'.
;; NOTE(review): the leading integers are line-number residue.  The
;; numbering skips 578 and 585, so the binding of `result', what receives
;; the `nconc' value and the function's return value are not visible.
577 (defun nnweb-parse-find-all-1 (type contents
)
579 (when (consp contents
)
580 (if (eq (car contents
) type
)
581 (push contents result
)
582 (when (listp (cdr contents
))
583 (dolist (element contents
)
584 (when (consp element
)
586 (nconc result
(nnweb-parse-find-all-1 type element
))))))))
;; Bind the variable `nnweb-text' to nil for the duration of the call and
;; return its contents in reverse order.
;; NOTE(review): the leading integers are line-number residue (the one
;; before the string also keeps it from being the docstring).  The
;; numbering skips 593, so the form that fills `nnweb-text' (presumably a
;; call to `nnweb-text-1' on PARSE -- confirm) is not visible.
590 (defun nnweb-text (parse)
591 "Return a list of text contents in PARSE."
592 (let ((nnweb-text nil
))
594 (nreverse nnweb-text
)))
(defun nnweb-text-1 (contents)
  "Push every string found in CONTENTS onto the variable `nnweb-text'.
CONTENTS is walked element by element.  Strings are pushed, so the
most recently seen string ends up first.  An element that is a cons
with a list tail is walked recursively; dotted pairs and all other
elements are ignored.  Returns nil."
  ;; The stray line-number literals have been removed: inside the `if'
  ;; they had become the then-branch, so strings were dropped and
  ;; non-strings were pushed instead.
  (dolist (item contents)
    (cond
     ((stringp item)
      (push item nnweb-text))
     ((and (consp item)
           (listp (cdr item)))
      (nnweb-text-1 item)))))
606 ;;; nnweb.el ends here