1 ;;; nnweb.el --- retrieving articles via web search engines
3 ;; Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
4 ;; 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
6 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
9 ;; This file is part of GNU Emacs.
11 ;; GNU Emacs is free software; you can redistribute it and/or modify
12 ;; it under the terms of the GNU General Public License as published by
13 ;; the Free Software Foundation; either version 3, or (at your option)
16 ;; GNU Emacs is distributed in the hope that it will be useful,
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;; GNU General Public License for more details.
21 ;; You should have received a copy of the GNU General Public License
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the
23 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
24 ;; Boston, MA 02110-1301, USA.
28 ;; Note: You need to have `w3' installed for some functions to work.
32 (eval-when-compile (require 'cl
))
44 (autoload 'w3-parse-buffer
"w3-parse")
(defvoo nnweb-directory (nnheader-concat gnus-directory "nnweb/")
  "Where nnweb will save its files.
The per-group overview files and the \"active\" file are created
under this directory.")
(defvoo nnweb-type 'google
  "What search engine type is being used.
Valid types include `google', `dejanews', and `gmane'.")
55 (defvar nnweb-type-definition
57 (id .
"http://www.google.com/groups?as_umsgid=%s&hl=en&dmode=source")
58 (result .
"http://groups.google.com/group/%s/msg/%s?dmode=source")
59 (article . nnweb-google-wash-article
)
60 (reference . identity
)
61 (map . nnweb-google-create-mapping
)
62 (search . nnweb-google-search
)
63 (address .
"http://groups.google.com/groups")
64 (base .
"http://groups.google.com")
65 (identifier . nnweb-google-identity
))
66 (dejanews ;; alias of google
67 (id .
"http://www.google.com/groups?as_umsgid=%s&hl=en&dmode=source")
68 (result .
"http://groups.google.com/group/%s/msg/%s?dmode=source")
69 (article . nnweb-google-wash-article
)
70 (reference . identity
)
71 (map . nnweb-google-create-mapping
)
72 (search . nnweb-google-search
)
73 (address .
"http://groups.google.com/groups")
74 (base .
"http://groups.google.com")
75 (identifier . nnweb-google-identity
))
77 (article . nnweb-gmane-wash-article
)
78 (id .
"http://gmane.org/view.php?group=%s")
79 (reference . identity
)
80 (map . nnweb-gmane-create-mapping
)
81 (search . nnweb-gmane-search
)
82 (address .
"http://search.gmane.org/nov.php")
83 (identifier . nnweb-gmane-identity
)))
84 "Type-definition alist.")
(defvoo nnweb-search nil
  "Search string to feed to the search engine.
It is passed to the `search' function of the current `nnweb-type'.")
(defvoo nnweb-max-hits 999
  "Maximum number of hits to display.")
(defvoo nnweb-ephemeral-p nil
  "Whether this nnweb server is ephemeral.
When non-nil, `nnweb-request-scan' starts from a fresh hash table
and does not read or write the overview file.")
95 ;;; Internal variables
(defvoo nnweb-articles nil
  "List of known articles.
Each element is a list whose car is the article number and whose
cadr is the article header.")

(defvoo nnweb-buffer nil
  "Buffer in which search results are fetched and parsed.
It is killed when the group or the server is closed.")

(defvoo nnweb-group-alist nil
  "Alist of (GROUP ACTIVE) entries.
ACTIVE is a cons of article numbers.  The alist is saved to the
active file by `nnweb-write-active'.")

(defvoo nnweb-group nil
  "The group most recently selected by `nnweb-possibly-change-server'.")

(defvoo nnweb-hashtb nil
  "Hash table of articles, keyed by the result of `nnweb-identifier'.")
103 ;;; Interface functions
;; Expand the `nnoo-define-basics' macro for the `nnweb' backend.
(nnoo-define-basics nnweb)
107 (deffoo nnweb-retrieve-headers
(articles &optional group server fetch-old
)
108 (nnweb-possibly-change-server group server
)
110 (set-buffer nntp-server-buffer
)
112 (let (article header
)
113 (mm-with-unibyte-current-buffer
114 (while (setq article
(pop articles
))
115 (when (setq header
(cadr (assq article nnweb-articles
)))
116 (nnheader-insert-nov header
))))
119 (deffoo nnweb-request-scan
(&optional group server
)
120 (nnweb-possibly-change-server group server
)
121 (if nnweb-ephemeral-p
122 (setq nnweb-hashtb
(gnus-make-hashtable 4095))
123 (unless nnweb-articles
124 (nnweb-read-overview group
)))
125 (funcall (nnweb-definition 'map
))
126 (unless nnweb-ephemeral-p
128 (nnweb-write-overview group
)))
130 (deffoo nnweb-request-group
(group &optional server dont-check
)
131 (nnweb-possibly-change-server group server
)
132 (unless (or nnweb-ephemeral-p
135 (nnweb-read-overview group
))
137 ((not nnweb-articles
)
138 (nnheader-report 'nnweb
"No matching articles"))
140 (let ((active (if nnweb-ephemeral-p
141 (cons (caar nnweb-articles
)
142 (caar (last nnweb-articles
)))
143 (cadr (assoc group nnweb-group-alist
)))))
144 (nnheader-report 'nnweb
"Opened group %s" group
)
146 "211 %d %d %d %s\n" (length nnweb-articles
)
147 (car active
) (cdr active
) group
)))))
149 (deffoo nnweb-close-group
(group &optional server
)
150 (nnweb-possibly-change-server group server
)
151 (when (gnus-buffer-live-p nnweb-buffer
)
153 (set-buffer nnweb-buffer
)
154 (set-buffer-modified-p nil
)
155 (kill-buffer nnweb-buffer
)))
158 (deffoo nnweb-request-article
(article &optional group server buffer
)
159 (nnweb-possibly-change-server group server
)
161 (set-buffer (or buffer nntp-server-buffer
))
162 (let* ((header (cadr (assq article nnweb-articles
)))
163 (url (and header
(mail-header-xref header
))))
165 (mm-with-unibyte-current-buffer
166 (mm-url-insert url
)))
167 (and (stringp article
)
168 (nnweb-definition 'id t
)
169 (let ((fetch (nnweb-definition 'id
))
171 (when (string-match "^<\\(.*\\)>$" article
)
172 (setq art
(match-string 1 article
)))
173 (when (and fetch art
)
174 (setq url
(format fetch
175 (mm-url-form-encode-xwfu art
)))
176 (mm-with-unibyte-current-buffer
178 (if (nnweb-definition 'reference t
)
180 (funcall (nnweb-definition
181 'reference
) article
)))))))
182 (unless nnheader-callback-function
183 (funcall (nnweb-definition 'article
)))
184 (nnheader-report 'nnweb
"Fetched article %s" article
)
185 (cons group
(and (numberp article
) article
))))))
187 (deffoo nnweb-close-server
(&optional server
)
188 (when (and (nnweb-server-opened server
)
189 (gnus-buffer-live-p nnweb-buffer
))
191 (set-buffer nnweb-buffer
)
192 (set-buffer-modified-p nil
)
193 (kill-buffer nnweb-buffer
)))
194 (nnoo-close-server 'nnweb server
))
196 (deffoo nnweb-request-list
(&optional server
)
197 (nnweb-possibly-change-server nil server
)
199 (set-buffer nntp-server-buffer
)
200 (nnmail-generate-active (list (assoc server nnweb-group-alist
)))
(deffoo nnweb-request-update-info (group info &optional server)
  "Make sure SERVER is open and GROUP is selected; INFO is not used.
The return value is that of `nnweb-possibly-change-server'."
  (nnweb-possibly-change-server group server))
206 (deffoo nnweb-asynchronous-p
()
209 (deffoo nnweb-request-create-group
(group &optional server args
)
210 (nnweb-possibly-change-server nil server
)
211 (nnweb-request-delete-group group
)
212 (push `(,group
,(cons 1 0)) nnweb-group-alist
)
216 (deffoo nnweb-request-delete-group
(group &optional force server
)
217 (nnweb-possibly-change-server group server
)
218 (gnus-pull group nnweb-group-alist t
)
220 (gnus-delete-file (nnweb-overview-file group
))
;; Expand the `nnoo-define-skeleton' macro for the `nnweb' backend.
(nnoo-define-skeleton nnweb)
225 ;;; Internal functions
227 (defun nnweb-read-overview (group)
228 "Read the overview of GROUP and build the map."
229 (when (file-exists-p (nnweb-overview-file group
))
230 (mm-with-unibyte-buffer
231 (nnheader-insert-file-contents (nnweb-overview-file group
))
232 (goto-char (point-min))
235 (setq header
(nnheader-parse-nov))
237 (push (list (mail-header-number header
)
238 header
(mail-header-xref header
))
240 (nnweb-set-hashtb header
(car nnweb-articles
)))))))
242 (defun nnweb-write-overview (group)
243 "Write the overview file for GROUP."
244 (with-temp-file (nnweb-overview-file group
)
245 (let ((articles nnweb-articles
))
247 (nnheader-insert-nov (cadr (pop articles
)))))))
249 (defun nnweb-set-hashtb (header data
)
250 (gnus-sethash (nnweb-identifier (mail-header-xref header
))
(defun nnweb-get-hashtb (url)
  "Return the entry stored for URL in `nnweb-hashtb'.
URL is first turned into a lookup key by `nnweb-identifier'."
  (let ((key (nnweb-identifier url)))
    (gnus-gethash key nnweb-hashtb)))
(defun nnweb-identifier (ident)
  "Return the result of the current `identifier' function applied to IDENT.
The function is obtained from `nnweb-definition'."
  (let ((identifier-function (nnweb-definition 'identifier)))
    (funcall identifier-function ident)))
(defun nnweb-overview-file (group)
  "Return the name of the overview file of GROUP.
The name is built by passing `nnweb-directory', GROUP and the
suffix \".overview\" to `nnheader-concat'."
  (nnheader-concat nnweb-directory group ".overview"))
(defun nnweb-write-active ()
  "Save the active file.
Call `gnus-make-directory' on `nnweb-directory', then write the
file \"active\" there.  The file holds a single `setq' form that
sets `nnweb-group-alist' to its current value, so that loading
the file restores the alist."
  (gnus-make-directory nnweb-directory)
  (with-temp-file (nnheader-concat nnweb-directory "active")
    ;; Print a form rather than bare data so the file can simply be `load'ed.
    (prin1 `(setq nnweb-group-alist ',nnweb-group-alist)
           (current-buffer))))
(defun nnweb-read-active ()
  "Read the active file.
The file \"active\" under `nnweb-directory' is `load'ed with
NOERROR, NOMESSAGE and NOSUFFIX all non-nil, so a missing file is
silently ignored and the exact file name is used."
  (load (nnheader-concat nnweb-directory "active") t t t))
273 (defun nnweb-definition (type &optional noerror
)
274 "Return the definition of TYPE."
275 (let ((def (cdr (assq type
(assq nnweb-type nnweb-type-definition
)))))
278 (error "Undefined definition %s" type
))
281 (defun nnweb-possibly-change-server (&optional group server
)
283 (unless (nnweb-server-opened server
)
284 (nnweb-open-server server
))
286 (unless nnweb-group-alist
289 (setq nnweb-hashtb
(gnus-make-hashtable 4095)))
291 (setq nnweb-group group
)))
293 (defun nnweb-init (server)
294 "Initialize buffers and such."
295 (unless (gnus-buffer-live-p nnweb-buffer
)
299 (nnheader-set-temp-buffer
300 (format " *nnweb %s %s %s*"
301 nnweb-type nnweb-search server
))
302 (current-buffer))))))
305 ;;; groups.google.com
308 (defun nnweb-google-wash-article ()
309 ;; We have Google's masked e-mail addresses here. :-/
310 (let ((case-fold-search t
)
311 (start-re "<pre>[\r\n ]*")
312 (end-re "[\r\n ]*</pre>"))
313 (goto-char (point-min))
315 (or (re-search-forward "The requested message.*could not be found."
317 (not (and (re-search-forward start-re nil t
)
318 (re-search-forward end-re nil t
)))))
319 ;; FIXME: Don't know how to indicate "not found".
320 ;; Should this function throw an error? --rsteib
322 (gnus-message 3 "Requested article not found")
324 (delete-region (point-min)
325 (re-search-forward start-re
))
326 (goto-char (point-min))
327 (delete-region (progn
328 (re-search-forward end-re
)
331 (mm-url-decode-entities))))
333 (defun nnweb-google-parse-1 (&optional Message-ID
)
334 "Parse search result in current buffer."
337 (active (cadr (assoc nnweb-group nnweb-group-alist
)))
338 Subject Score Date Newsgroups From
341 (push (list nnweb-group
(setq active
(cons 1 0)))
343 ;; Go through all the article hits on this page.
344 (goto-char (point-min))
347 "a +href=\"/group/\\([^>\"]+\\)/browse_thread/[^>]+#\\([0-9a-f]+\\)"
349 (setq Newsgroups
(match-string-no-properties 1)
350 ;; Note: Starting with Google Groups 2, `mid' is a Google-internal
351 ;; ID, not a proper Message-ID.
352 mid
(match-string-no-properties 2)
354 (nnweb-definition 'result
) Newsgroups mid
))
355 (narrow-to-region (search-forward ">" nil t
)
356 (search-forward "</a>" nil t
))
357 (mm-url-remove-markup)
358 (mm-url-decode-entities)
359 (setq Subject
(buffer-string))
360 (goto-char (point-max))
362 (narrow-to-region (point)
363 (search-forward "</table" nil t
))
365 (mm-url-remove-markup)
366 (mm-url-decode-entities)
367 (goto-char (point-max))
370 "^\\(?:\\(\\w+\\) \\([0-9]+\\)\\|\\S-+\\)\\(?: \\([0-9]\\{4\\}\\)\\)? by \\(.*\\)"
372 (setq Date
(if (match-string 1)
373 (format "%s %s 00:00:00 %s"
377 (substring (current-time-string) -
4)))
378 (current-time-string)))
379 (setq From
(match-string 4)))
382 (unless (nnweb-get-hashtb url
)
386 (make-full-mail-header
387 (cdr active
) (if Newsgroups
388 (concat "(" Newsgroups
") " Subject
)
390 From Date
(or Message-ID mid
)
393 (nnweb-set-hashtb (cadar map
) (car map
))))
396 (defun nnweb-google-reference (id)
397 (let ((map (nnweb-google-parse-1 id
)) header
)
399 (nconc nnweb-articles map
))
400 (when (setq header
(cadar map
))
401 (mm-with-unibyte-current-buffer
402 (mm-url-insert (mail-header-xref header
)))
405 (defun nnweb-google-create-mapping ()
406 "Perform the search and create a number-to-url alist."
408 (set-buffer nnweb-buffer
)
410 (nnheader-message 7 "Searching google...")
411 (when (funcall (nnweb-definition 'search
) nnweb-search
)
416 (nconc nnweb-articles
(nnweb-google-parse-1)))
417 ;; Check if there are more articles to fetch
418 (goto-char (point-min))
420 (if (or (not (re-search-forward
421 "<a [^>]+href=\"\n?\\([^>\" \n\t]+\\)[^<]*<img[^>]+src=[^>]+next"
423 (>= i nnweb-max-hits
))
425 ;; Yup, there are more articles
426 (setq more
(concat (nnweb-definition 'base
) (match-string 1)))
429 (nnheader-message 7 "Searching google...(%d)" i
)
430 (mm-url-insert more
))))
431 ;; Return the articles in the right order.
432 (nnheader-message 7 "Searching google...done")
434 (sort nnweb-articles
'car-less-than-car
))))))
436 (defun nnweb-google-search (search)
439 (nnweb-definition 'address
)
441 (mm-url-encode-www-form-urlencoded
443 ("num" .
,(number-to-string
444 (min 100 nnweb-max-hits
)))
453 (defun nnweb-google-identity (url)
454 "Return an unique identifier based on URL."
455 (if (string-match "selm=\\([^ &>]+\\)" url
)
462 (defun nnweb-gmane-create-mapping ()
463 "Perform the search and create a number-to-url alist."
465 (set-buffer nnweb-buffer
)
466 (let ((case-fold-search t
)
467 (active (or (cadr (assoc nnweb-group nnweb-group-alist
))
471 (nnheader-message 7 "Searching Gmane..." )
472 (when (funcall (nnweb-definition 'search
) nnweb-search
)
473 (goto-char (point-min))
474 ;; Skip the status line
476 ;; Thanks to Olly Betts we now have NOV lines in our buffer!
478 (unless (or (eolp) (looking-at "\x0d"))
479 (let ((header (nnheader-parse-nov)))
480 (let ((xref (mail-header-xref header
))
481 (from (mail-header-from header
))
482 (subject (mail-header-subject header
))
483 (rfc2047-encoding-type 'mime
))
484 (when (string-match " \\([^:]+\\)[:/]\\([0-9]+\\)" xref
)
485 (mail-header-set-xref
487 (format "http://article.gmane.org/%s/%s/raw"
488 (match-string 1 xref
)
489 (match-string 2 xref
))))
491 ;; Add host part to gmane-encrypted addresses
492 (when (string-match "@$" from
)
493 (mail-header-set-from header
494 (concat from
"public.gmane.org")))
496 (mail-header-set-subject header
497 (rfc2047-encode-string subject
))
499 (unless (nnweb-get-hashtb (mail-header-xref header
))
500 (mail-header-set-number header
(incf (cdr active
)))
501 (push (list (mail-header-number header
) header
) map
)
502 (nnweb-set-hashtb (cadar map
) (car map
))))))
504 (nnheader-message 7 "Searching Gmane...done")
506 (sort (nconc nnweb-articles map
) 'car-less-than-car
)))))
508 (defun nnweb-gmane-wash-article ()
509 (let ((case-fold-search t
))
510 (goto-char (point-min))
511 (when (search-forward "<!--X-Head-of-Message-->" nil t
)
512 (delete-region (point-min) (point))
513 (goto-char (point-min))
514 (while (looking-at "^<li><em>\\([^ ]+\\)</em>.*</li>")
515 (replace-match "\\1\\2" t
)
517 (mm-url-remove-markup))))
519 (defun nnweb-gmane-search (search)
522 (nnweb-definition 'address
)
524 (mm-url-encode-www-form-urlencoded
525 `(("query" .
,search
)
526 ("HITSPERPAGE" .
,(number-to-string nnweb-max-hits
))
527 ;;("TOPDOC" . "1000")
529 (setq buffer-file-name nil
)
530 (set-buffer-multibyte t
)
531 (mm-decode-coding-region (point-min) (point-max) 'utf-8
)
534 (defun nnweb-gmane-identity (url)
535 "Return a unique identifier based on URL."
536 (if (string-match "group=\\(.+\\)" url
)
541 ;;; General web/w3 interface utility functions
544 (defun nnweb-insert-html (parse)
545 "Insert HTML based on a w3 parse tree."
547 (insert (nnheader-string-as-multibyte parse
))
548 (insert "<" (symbol-name (car parse
)) " ")
551 (concat (symbol-name (car param
)) "="
553 (if (consp (cdr param
))
559 (mapc 'nnweb-insert-html
(nth 2 parse
))
560 (insert "</" (symbol-name (car parse
)) ">\n")))
562 (defun nnweb-parse-find (type parse
&optional maxdepth
)
563 "Find the element of TYPE in PARSE."
565 (nnweb-parse-find-1 type parse maxdepth
)))
(defun nnweb-parse-find-1 (type contents maxdepth)
  "Search CONTENTS for a cons whose car is TYPE and throw it to `found'.
CONTENTS is searched recursively through its cons elements.
MAXDEPTH, if non-nil, is the number of levels left to examine;
when it reaches zero nothing is searched.  Return nil if no match
is thrown.  Must be called inside a `catch' for the tag `found'."
  (unless (and maxdepth (zerop maxdepth))
    (when (consp contents)
      (if (eq type (car contents))
          (throw 'found contents))
      ;; Only descend into proper-looking lists, not dotted pairs.
      (when (listp (cdr contents))
        (let ((remaining (and maxdepth (1- maxdepth))))
          (dolist (child contents)
            (if (consp child)
                (nnweb-parse-find-1 type child remaining))))))))
579 (defun nnweb-parse-find-all (type parse
)
580 "Find all elements of TYPE in PARSE."
582 (nnweb-parse-find-all-1 type parse
)))
584 (defun nnweb-parse-find-all-1 (type contents
)
586 (when (consp contents
)
587 (if (eq (car contents
) type
)
588 (push contents result
)
589 (when (listp (cdr contents
))
590 (dolist (element contents
)
591 (when (consp element
)
593 (nconc result
(nnweb-parse-find-all-1 type element
))))))))
597 (defun nnweb-text (parse)
598 "Return a list of text contents in PARSE."
599 (let ((nnweb-text nil
))
601 (nreverse nnweb-text
)))
(defun nnweb-text-1 (contents)
  "Push every string found in CONTENTS onto the variable `nnweb-text'.
Strings are pushed in the order they are met, so the resulting
list is in reverse order.  Cons elements whose cdr is a list are
searched recursively; any other element is ignored."
  (dolist (item contents)
    (cond ((stringp item)
           (push item nnweb-text))
          ((and (consp item)
                (listp (cdr item)))
           (nnweb-text-1 item)))))
613 ;;; arch-tag: f59307eb-c90f-479f-b7d2-dbd8bf51b697
614 ;;; nnweb.el ends here