1 ;;; nnweb.el --- retrieving articles via web search engines
3 ;; Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
4 ;; 2004, 2005 Free Software Foundation, Inc.
6 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
9 ;; This file is part of GNU Emacs.
11 ;; GNU Emacs is free software; you can redistribute it and/or modify
12 ;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
16 ;; GNU Emacs is distributed in the hope that it will be useful,
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;; GNU General Public License for more details.
21 ;; You should have received a copy of the GNU General Public License
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the
23 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
24 ;; Boston, MA 02110-1301, USA.
28 ;; Note: You need to have `w3' installed for some functions to work.
30 ;; FIXME: Due to changes in the HTML output of Google Groups and Gmane, stuff
31 ;; related to web groups (gnus-group-make-web-group) doesn't work anymore.
33 ;; Fetching an article by MID (cf. gnus-refer-article-method) over Google
34 ;; Groups should work.
38 (eval-when-compile (require 'cl
))
50 (autoload 'w3-parse-buffer
"w3-parse")
;; Root of nnweb's on-disk state: the "active" file and the
;; per-group ".overview" files are created below this directory.
(defvoo nnweb-directory (nnheader-concat gnus-directory "nnweb/")
  "Where nnweb will save its files.")
;; The value selects one entry of `nnweb-type-definition'.
(defvoo nnweb-type 'google
  "What search engine type is being used.
Valid types include `google', `dejanews', and `gmane'.")
(defvar nnweb-type-definition
  ;; Each entry is (TYPE (KEY . VALUE)...).  `nnweb-definition' looks
  ;; up KEY in the entry selected by `nnweb-type'.  Function-valued
  ;; keys (article, reference, map, search, identifier) are funcalled
  ;; by the back end; string-valued keys (id, address, base) are URLs
  ;; or `format' templates.
  '((google
     (id . "http://www.google.com/groups?as_umsgid=%s&hl=en&dmode=source")
     (article . nnweb-google-wash-article)
     (reference . identity)
     (map . nnweb-google-create-mapping)
     (search . nnweb-google-search)
     (address . "http://groups.google.com/groups")
     (base . "http://groups.google.com")
     (identifier . nnweb-google-identity))
    (dejanews ;; alias of google
     (article . nnweb-google-wash-article)
     (id . "http://groups.google.com/groups?selm=%s&output=gplain")
     (reference . identity)
     (map . nnweb-google-create-mapping)
     (search . nnweb-google-search)
     (address . "http://groups.google.com/groups")
     (base . "http://groups.google.com")
     (identifier . nnweb-google-identity))
    (gmane
     (article . nnweb-gmane-wash-article)
     (id . "http://gmane.org/view.php?group=%s")
     (reference . identity)
     (map . nnweb-gmane-create-mapping)
     (search . nnweb-gmane-search)
     (address . "http://gmane.org/")
     (identifier . nnweb-gmane-identity)))
  "Type-definition alist.")
;; Passed unchanged to the `search' function of the current type.
(defvoo nnweb-search nil
  "Search string to feed to Google.")
;; Upper bound consulted while paging through search results.
(defvoo nnweb-max-hits 999
  "Maximum number of hits to display.")
;; When non-nil, nothing is written to the overview file and the
;; article range is derived from `nnweb-articles' itself.
(defvoo nnweb-ephemeral-p nil
  "Whether this nnweb server is ephemeral.")
99 ;;; Internal variables
;; Article map: a list whose entries start with (NUMBER HEADER ...).
(defvoo nnweb-articles nil)
;; Buffer into which search result pages are fetched.
(defvoo nnweb-buffer nil)
;; Known groups; entries look like (GROUP ACTIVE TYPE SEARCH), where
;; ACTIVE is a cons of article numbers.
(defvoo nnweb-group-alist nil)
;; Name of the currently selected group.
(defvoo nnweb-group nil)
;; Table of already-seen articles, keyed by `nnweb-identifier'.
(defvoo nnweb-hashtb nil)
107 ;;; Interface functions
;; Let nnoo generate the standard server functions for this back end
;; (presumably including `nnweb-open-server' and `nnweb-server-opened',
;; which are called below but not defined in this file).
(nnoo-define-basics nnweb)
(deffoo nnweb-retrieve-headers (articles &optional group server fetch-old)
  "Insert NOV lines for ARTICLES into `nntp-server-buffer'.
ARTICLES is a list of article numbers; numbers that have no entry in
`nnweb-articles' are skipped.  GROUP and SERVER are handed to
`nnweb-possibly-change-server'; FETCH-OLD is ignored.
Return the symbol `nov'."
  (nnweb-possibly-change-server group server)
  (with-current-buffer nntp-server-buffer
    ;; Start from an empty buffer so only the requested headers are
    ;; reported.
    (erase-buffer)
    (let (article header)
      (mm-with-unibyte-current-buffer
        (while (setq article (pop articles))
          (when (setq header (cadr (assq article nnweb-articles)))
            (nnheader-insert-nov header))))
      'nov)))
(deffoo nnweb-request-scan (&optional group server)
  "Run the search for GROUP on SERVER by calling the `map' definition.
Unless the server is ephemeral, save the active file and write
GROUP's overview file afterwards."
  (nnweb-possibly-change-server group server)
  ;; An ephemeral server starts every scan with a fresh URL table.
  (when nnweb-ephemeral-p
    (setq nnweb-hashtb (gnus-make-hashtable 4095)))
  (funcall (nnweb-definition 'map))
  (unless nnweb-ephemeral-p
    ;; The mapping step may have added to `nnweb-group-alist', so the
    ;; active file is saved together with the overview.
    (nnweb-write-active)
    (nnweb-write-overview group)))
(deffoo nnweb-request-group (group &optional server dont-check)
  "Select GROUP on SERVER and report its article range.
When GROUP differs from the current group and the server is not
ephemeral, switch to it: reset `nnweb-articles', take `nnweb-type'
and `nnweb-search' from GROUP's entry in `nnweb-group-alist' and,
unless DONT-CHECK is non-nil, read GROUP's overview file.
If no articles are known, report \"No matching articles\";
otherwise insert a \"211\" status line for GROUP."
  (nnweb-possibly-change-server nil server)
  (when (and group
             (not (equal group nnweb-group))
             (not nnweb-ephemeral-p))
    (setq nnweb-group group
          nnweb-articles nil)
    (let ((info (assoc group nnweb-group-alist)))
      (when info
        (setq nnweb-type (nth 2 info))
        (setq nnweb-search (nth 3 info))
        (unless dont-check
          (nnweb-read-overview group)))))
  (cond
   ((not nnweb-articles)
    (nnheader-report 'nnweb "No matching articles"))
   (t
    ;; Ephemeral groups have no stored range, so take it from the
    ;; first and last entries of the article map.
    (let ((active (if nnweb-ephemeral-p
                      (cons (caar nnweb-articles)
                            (caar (last nnweb-articles)))
                    (cadr (assoc group nnweb-group-alist)))))
      (nnheader-report 'nnweb "Opened group %s" group)
      (nnheader-insert
       "211 %d %d %d %s\n" (length nnweb-articles)
       (car active) (cdr active) group)))))
(deffoo nnweb-close-group (group &optional server)
  "Close GROUP on SERVER by killing `nnweb-buffer' if it is live.
Always return t."
  (nnweb-possibly-change-server group server)
  (when (gnus-buffer-live-p nnweb-buffer)
    (with-current-buffer nnweb-buffer
      ;; Clear the modified flag first so that killing the buffer
      ;; does not ask for confirmation.
      (set-buffer-modified-p nil)
      (kill-buffer nnweb-buffer)))
  t)
(deffoo nnweb-request-article (article &optional group server buffer)
  "Fetch ARTICLE into BUFFER, or into `nntp-server-buffer' if nil.
If ARTICLE has an entry in `nnweb-articles', the URL stored in its
header's Xref field is fetched.  Otherwise, if ARTICLE is a string
of the form \"<ID>\" and the current type has an `id' definition,
that definition is used as a `format' template for the URL; the
type's `reference' function, if any, may then replace ARTICLE.
On success the `article' definition is called to wash the buffer
\(unless `nnheader-callback-function' is set) and the result is
\(GROUP . NUMBER), with NUMBER nil when ARTICLE is not a number.
Return nil when nothing could be fetched."
  (nnweb-possibly-change-server group server)
  (with-current-buffer (or buffer nntp-server-buffer)
    (let* ((header (cadr (assq article nnweb-articles)))
           (url (and header (mail-header-xref header))))
      (when (or (and url
                     (mm-with-unibyte-current-buffer
                       (mm-url-insert url)))
                (and (stringp article)
                     (nnweb-definition 'id t)
                     (let ((fetch (nnweb-definition 'id))
                           art)
                       ;; Strip the angle brackets from the Message-ID.
                       (when (string-match "^<\\(.*\\)>$" article)
                         (setq art (match-string 1 article)))
                       (when (and fetch art)
                         (setq url (format fetch art))
                         (mm-with-unibyte-current-buffer
                           (mm-url-insert url))
                         (if (nnweb-definition 'reference t)
                             (setq article
                                   (funcall (nnweb-definition
                                             'reference) article)))))))
        (unless nnheader-callback-function
          (funcall (nnweb-definition 'article)))
        (nnheader-report 'nnweb "Fetched article %s" article)
        (cons group (and (numberp article) article))))))
(deffoo nnweb-close-server (&optional server)
  "Close SERVER.
Kill `nnweb-buffer' if SERVER is open and the buffer is live, then
let `nnoo-close-server' do the rest and return its value."
  (when (and (nnweb-server-opened server)
             (gnus-buffer-live-p nnweb-buffer))
    (with-current-buffer nnweb-buffer
      ;; Clear the modified flag first so that killing the buffer
      ;; does not ask for confirmation.
      (set-buffer-modified-p nil)
      (kill-buffer nnweb-buffer)))
  (nnoo-close-server 'nnweb server))
(deffoo nnweb-request-list (&optional server)
  "Generate the active list for SERVER in `nntp-server-buffer'.
The list is produced from `nnweb-group-alist' by
`nnmail-generate-active'.  Return t."
  (nnweb-possibly-change-server nil server)
  (with-current-buffer nntp-server-buffer
    (nnmail-generate-active nnweb-group-alist)
    t))
(deffoo nnweb-request-update-info (group info &optional server)
  "Hand GROUP and SERVER to `nnweb-possibly-change-server'.
INFO is ignored."
  (nnweb-possibly-change-server group server))
(deffoo nnweb-asynchronous-p ()
  "Return nil: nnweb does not fetch articles asynchronously."
  nil)
(deffoo nnweb-request-create-group (group &optional server args)
  "Create GROUP on SERVER and save the active file.
Any existing GROUP is deleted first.  The new entry in
`nnweb-group-alist' is (GROUP (1 . 0) . ARGS).  Return t."
  (nnweb-possibly-change-server nil server)
  (nnweb-request-delete-group group)
  (push `(,group ,(cons 1 0) ,@args) nnweb-group-alist)
  ;; Persist the new entry so it survives the session.
  (nnweb-write-active)
  t)
(deffoo nnweb-request-delete-group (group &optional force server)
  "Delete GROUP on SERVER.
Remove GROUP from `nnweb-group-alist', save the active file and
delete GROUP's overview file.  FORCE is ignored.  Return t."
  (nnweb-possibly-change-server group server)
  (gnus-pull group nnweb-group-alist t)
  ;; Persist the removal before deleting the overview file.
  (nnweb-write-active)
  (gnus-delete-file (nnweb-overview-file group))
  t)
;; Let nnoo fill in the remaining back end interface functions that
;; this file does not define itself.
(nnoo-define-skeleton nnweb)
233 ;;; Internal functions
(defun nnweb-read-overview (group)
  "Read the overview of GROUP and build the map.
Each NOV line of GROUP's overview file becomes an entry
\(NUMBER HEADER XREF) pushed onto `nnweb-articles', and is also
registered in `nnweb-hashtb'.  Do nothing if the file does not exist."
  (when (file-exists-p (nnweb-overview-file group))
    (mm-with-unibyte-buffer
      (nnheader-insert-file-contents (nnweb-overview-file group))
      (goto-char (point-min))
      (let (header)
        (while (not (eobp))
          (setq header (nnheader-parse-nov))
          ;; Step to the next NOV line before handling this one.
          (forward-line 1)
          (push (list (mail-header-number header)
                      header (mail-header-xref header))
                nnweb-articles)
          (nnweb-set-hashtb header (car nnweb-articles)))))))
(defun nnweb-write-overview (group)
  "Write the overview file for GROUP.
One NOV line is written for the header of every entry in
`nnweb-articles'."
  (with-temp-file (nnweb-overview-file group)
    (dolist (entry nnweb-articles)
      ;; ENTRY starts with (NUMBER HEADER ...); only HEADER is written.
      (nnheader-insert-nov (cadr entry)))))
(defun nnweb-set-hashtb (header data)
  "Store DATA in `nnweb-hashtb' under the identifier of HEADER.
The key is `nnweb-identifier' applied to the URL in HEADER's Xref
field, which is the same key `nnweb-get-hashtb' computes from a URL."
  (gnus-sethash (nnweb-identifier (mail-header-xref header))
                data nnweb-hashtb))
(defun nnweb-get-hashtb (url)
  "Look up URL in `nnweb-hashtb'.
URL is first reduced to its key by `nnweb-identifier'."
  (let ((key (nnweb-identifier url)))
    (gnus-gethash key nnweb-hashtb)))
(defun nnweb-identifier (ident)
  "Apply the `identifier' function of the current type to IDENT."
  (let ((identify (nnweb-definition 'identifier)))
    (funcall identify ident)))
(defun nnweb-overview-file (group)
  "Return the name of the overview file of GROUP."
  ;; The file lives in `nnweb-directory' and is named GROUP.overview.
  (nnheader-concat nnweb-directory
                   group
                   ".overview"))
(defun nnweb-write-active ()
  "Save the active file."
  (gnus-make-directory nnweb-directory)
  ;; The file holds a single `setq' form, so it can be restored by
  ;; loading it.
  (let ((active-file (nnheader-concat nnweb-directory "active"))
        (form (list 'setq 'nnweb-group-alist
                    (list 'quote nnweb-group-alist))))
    (with-temp-file active-file
      (prin1 form (current-buffer)))))
(defun nnweb-read-active ()
  "Read the active file."
  ;; The three trailing t arguments to `load' are NOERROR, NOMESSAGE
  ;; and NOSUFFIX.
  (let ((active-file (nnheader-concat nnweb-directory "active")))
    (load active-file t t t)))
(defun nnweb-definition (type &optional noerror)
  "Return the definition of TYPE.
TYPE is looked up in the entry of `nnweb-type-definition' selected
by `nnweb-type'.  If there is no such definition, signal an error,
unless NOERROR is non-nil, in which case return nil."
  (let ((def (cdr (assq type
                        (assq nnweb-type nnweb-type-definition)))))
    (when (and (not def)
               (not noerror))
      (error "Undefined definition %s" type))
    def))
(defun nnweb-possibly-change-server (&optional group server)
  "Make sure the back end is ready to serve GROUP on SERVER.
Initialize the working buffer, open SERVER if it is given and not
yet open, read the active file if `nnweb-group-alist' is empty and
create `nnweb-hashtb' if needed.  If GROUP is the current group of
a non-ephemeral server, request it again without reading its
overview."
  (nnweb-init server)
  (when server
    (unless (nnweb-server-opened server)
      (nnweb-open-server server)))
  (unless nnweb-group-alist
    (nnweb-read-active))
  (unless nnweb-hashtb
    (setq nnweb-hashtb (gnus-make-hashtable 4095)))
  (when group
    (when (and (not nnweb-ephemeral-p)
               (equal group nnweb-group))
      (nnweb-request-group group nil t))))
(defun nnweb-init (server)
  "Initialize buffers and such.
Unless `nnweb-buffer' is already live, create a temporary buffer
named after `nnweb-type', `nnweb-search' and SERVER and store it in
`nnweb-buffer'."
  (unless (gnus-buffer-live-p nnweb-buffer)
    (setq nnweb-buffer
          ;; `nnheader-set-temp-buffer' switches buffers, so restore
          ;; the caller's buffer afterwards.
          (save-excursion
            ;; NOTE(review): the `mm-with-unibyte' wrapper is restored
            ;; from the paren structure of the damaged original --
            ;; confirm against upstream.
            (mm-with-unibyte
              (nnheader-set-temp-buffer
               (format " *nnweb %s %s %s*"
                       nnweb-type nnweb-search server))
              (current-buffer))))))
315 ;;; groups.google.com
(defun nnweb-google-wash-article ()
  "Reduce the fetched Google page in the current buffer to the article.
If the page says the message could not be found, or has no
<pre>...</pre> section, log a message and erase the buffer.
Otherwise keep only the text between <pre> and </pre> and decode
HTML entities."
  ;; We have Google's masked e-mail addresses here.  :-/
  (let ((case-fold-search t))
    (goto-char (point-min))
    (if (save-excursion
          (or (re-search-forward "The requested message.*could not be found."
                                 nil t)
              (not (and (re-search-forward "^<pre>" nil t)
                        (re-search-forward "^</pre>" nil t)))))
        ;; FIXME: Don't know how to indicate "not found".
        ;; Should this function throw an error?  --rsteib
        (progn
          (gnus-message 3 "Requested article not found")
          (erase-buffer))
      ;; Both searches below are known to succeed: the check above
      ;; ran the same searches inside `save-excursion'.
      (delete-region (point-min)
                     (1+ (re-search-forward "^<pre>" nil t)))
      (goto-char (point-min))
      (delete-region (- (re-search-forward "^</pre>" nil t) (length "</pre>"))
                     (point-max))
      (mm-url-decode-entities))))
(defun nnweb-google-parse-1 (&optional Message-ID)
  "Parse search result in current buffer.
Return a list of new (NUMBER HEADER) entries, newest first, for the
hits that are not yet in `nnweb-hashtb'.  Each new hit is also
registered in the hash table, and gets Message-ID as its id if that
is non-nil, otherwise the id found in the hit's link.
If `nnweb-group' has no entry in `nnweb-group-alist', one is added."
  (let ((active (cadr (assoc nnweb-group nnweb-group-alist)))
        Subject Date Newsgroups From
        map url mid)
    (unless active
      (push (list nnweb-group (setq active (cons 1 0))
                  nnweb-type nnweb-search)
            nnweb-group-alist))
    ;; Go through all the article hits on this page.
    (goto-char (point-min))
    (while (re-search-forward
            "a href=/groups\\(\\?[^ \">]*selm=\\([^ &\">]+\\)\\)" nil t)
      (setq mid (match-string 2)
            url (format
                 (nnweb-definition 'id) mid))
      ;; The link text is the subject.
      (narrow-to-region (search-forward ">" nil t)
                        (search-forward "</a>" nil t))
      (mm-url-remove-markup)
      (mm-url-decode-entities)
      (setq Subject (buffer-string))
      (goto-char (point-max))
      (widen)
      (forward-line 1)
      (when (looking-at "<br><font[^>]+>")
        (goto-char (match-end 0)))
      ;; An optional link holding the newsgroup name comes next.
      (if (not (looking-at "<a[^>]+>"))
          (skip-chars-forward " \t")
        (narrow-to-region (point)
                          (search-forward "</a>" nil t))
        (mm-url-remove-markup)
        (mm-url-decode-entities)
        (setq Newsgroups (buffer-string))
        (goto-char (point-max))
        (widen)
        (skip-chars-forward "- \t"))
      ;; Then "DAY MONTH YEAR by AUTHOR - <a".
      (when (looking-at
             "\\([0-9]+\\)[/ ]\\([A-Za-z]+\\)[/ ]\\([0-9]+\\)[ \t]*by[ \t]*\\([^<]*\\) - <a")
        (setq From (match-string 4)
              Date (format "%s %s 00:00:00 %s"
                           (match-string 2) (match-string 1)
                           (match-string 3))))
      (forward-line 1)
      (unless (nnweb-get-hashtb url)
        (push
         (list
          (incf (cdr active))
          (make-full-mail-header
           (cdr active) (if Newsgroups
                            (concat "(" Newsgroups ") " Subject)
                          Subject)
           From Date (or Message-ID mid)
           nil 0 0 url))
         map)
        (nnweb-set-hashtb (cadar map) (car map))))
    map))
(defun nnweb-google-reference (id)
  "Register the article with Message-ID ID from the current buffer.
The buffer is parsed with `nnweb-google-parse-1' and the new
entries are appended to `nnweb-articles'.  If an entry was found,
fetch the URL in its Xref field into the current buffer and return
its article number; otherwise return nil."
  (let ((map (nnweb-google-parse-1 id)) header)
    ;; Store the result of `nconc': when `nnweb-articles' is nil,
    ;; `nconc' cannot modify it in place.
    (setq nnweb-articles
          (nconc nnweb-articles map))
    (when (setq header (cadar map))
      (mm-with-unibyte-current-buffer
        (mm-url-insert (mail-header-xref header)))
      (caar map))))
(defun nnweb-google-create-mapping ()
  "Perform the search and create a number-to-url alist.
Work in `nnweb-buffer': run the `search' definition on
`nnweb-search', parse every result page into `nnweb-articles' and
follow the \"Next\" link until there is none or `nnweb-max-hits'
is reached.  Leave `nnweb-articles' sorted by article number."
  (with-current-buffer nnweb-buffer
    (erase-buffer)
    (when (funcall (nnweb-definition 'search) nnweb-search)
      (let ((more t)
            (i 0))
        (while more
          (setq nnweb-articles
                (nconc nnweb-articles (nnweb-google-parse-1)))
          ;; Check if there are more articles to fetch
          (goto-char (point-min))
          ;; Count each page as 100 hits.
          (incf i 100)
          (if (or (not (re-search-forward
                        "<td nowrap><a href=\\([^>]+\\).*<span class=b>Next</span>" nil t))
                  (>= i nnweb-max-hits))
              (setq more nil)
            ;; Yup, there are more articles
            (setq more (concat (nnweb-definition 'base) (match-string 1)))
            (when more
              (erase-buffer)
              (mm-url-insert more))))
        ;; Return the articles in the right order.
        (setq nnweb-articles
              (sort nnweb-articles 'car-less-than-car))))))
(defun nnweb-google-search (search)
  "Insert the Google Groups result page for SEARCH at point.
The query is sent to the `address' definition.  Return t."
  (mm-url-insert
   (concat
    (nnweb-definition 'address)
    "?"
    (mm-url-encode-www-form-urlencoded
     ;; NOTE(review): only the last pair survived in the damaged
     ;; original; the other pairs are reconstructed and must be
     ;; confirmed against upstream.
     `(("q" . ,search)
       ("num" . "100")
       ("hq" . "")
       ("hl" . "en")
       ("lr" . "")
       ("safe" . "off")
       ("sites" . "groups")))))
  t)
(defun nnweb-google-identity (url)
  "Return an unique identifier based on URL.
This is the value of the \"selm=\" parameter if URL has one,
otherwise URL itself."
  (if (string-match "selm=\\([^ &>]+\\)" url)
      (match-string 1 url)
    url))
(defun nnweb-gmane-create-mapping ()
  "Perform the search and create a number-to-url alist.
Work in `nnweb-buffer': run the `search' definition on
`nnweb-search', then turn every link after the \"Search Results\"
heading into an entry, skipping URLs already in `nnweb-hashtb'.
Leave `nnweb-articles' sorted by article number."
  (with-current-buffer nnweb-buffer
    (erase-buffer)
    (when (funcall (nnweb-definition 'search) nnweb-search)
      (let ((case-fold-search t)
            (active (or (cadr (assoc nnweb-group nnweb-group-alist))
                        (cons 1 0)))
            ;; NOTE(review): nothing here ever assigns `group', so the
            ;; subject prefix below is always "() " -- confirm whether
            ;; the group name should be taken from the hit.
            subject group url
            map)
        ;; Remove stuff from the beginning of results
        (goto-char (point-min))
        (search-forward "Search Results</h1><ul>" nil t)
        (delete-region (point-min) (point))
        (goto-char (point-min))
        ;; Iterate over the actual hits
        (while (re-search-forward ".*href=\"\\([^\"]+\\)\">\\(.*\\)" nil t)
          (setq url (concat "http://gmane.org/" (match-string 1)))
          (setq subject (match-string 2))
          (unless (nnweb-get-hashtb url)
            (push
             (list
              (incf (cdr active))
              (make-full-mail-header
               (cdr active) (concat "(" group ") " subject) nil nil
               nil nil 0 0 url))
             map)
            (nnweb-set-hashtb (cadar map) (car map))))
        ;; Return the articles in the right order.
        (setq nnweb-articles
              (sort (nconc nnweb-articles map) 'car-less-than-car))))))
(defun nnweb-gmane-wash-article ()
  "Reduce the fetched Gmane page in the current buffer to the article.
Delete everything up to the \"<!--X-Head-of-Message-->\" marker,
rewrite the leading <li><em>NAME</em>...</li> lines and finally
remove the remaining markup."
  (let ((case-fold-search t))
    (goto-char (point-min))
    (re-search-forward "<!--X-Head-of-Message-->" nil t)
    (delete-region (point-min) (point))
    (goto-char (point-min))
    ;; NOTE(review): the regexp has only one group, so "\\2" in the
    ;; replacement refers to a group that does not exist.
    (while (looking-at "^<li><em>\\([^ ]+\\)</em>.*</li>")
      (replace-match "\\1\\2" t)
      ;; Advance so that each following header line is examined.
      (forward-line 1))
    (mm-url-remove-markup)))
(defun nnweb-gmane-search (search)
  "Insert the Gmane result page for SEARCH at point.
The query is sent to the `address' definition.  Return t."
  (mm-url-insert
   (concat
    (nnweb-definition 'address)
    "?"
    (mm-url-encode-www-form-urlencoded
     `(("query" . ,search)))))
  ;; Make sure the buffer is not visiting a file after the fetch.
  (setq buffer-file-name nil)
  t)
(defun nnweb-gmane-identity (url)
  "Return a unique identifier based on URL.
This is everything after \"group=\" if URL contains it, otherwise
URL itself."
  (if (string-match "group=\\(.+\\)" url)
      (match-string 1 url)
    url))
521 ;;; General web/w3 interface utility functions
(defun nnweb-insert-html (parse)
  "Insert HTML based on a w3 parse tree.
PARSE is either a string, which is inserted as text, or an element
of the form (TAG ATTRIBUTES CHILDREN).  ATTRIBUTES is a list whose
items are (NAME . VALUE) or (NAME VALUE); CHILDREN are inserted
recursively between the opening and closing tags."
  (if (stringp parse)
      (insert (nnheader-string-as-multibyte parse))
    (insert "<" (symbol-name (car parse)) " ")
    (insert (mapconcat
             (lambda (param)
               (concat (symbol-name (car param)) "="
                       (prin1-to-string
                        ;; Accept both (NAME . VALUE) and (NAME VALUE).
                        (if (consp (cdr param))
                            (cadr param)
                          (cdr param)))))
             (nth 1 parse)
             " "))
    (insert ">\n")
    (mapcar 'nnweb-insert-html (nth 2 parse))
    (insert "</" (symbol-name (car parse)) ">\n")))
(defun nnweb-parse-find (type parse &optional maxdepth)
  "Find the element of TYPE in PARSE.
Return the first element found, or nil if there is none.  If
MAXDEPTH is non-nil, it limits how deep the search descends."
  ;; `nnweb-parse-find-1' reports a hit by throwing to `found'.
  (catch 'found
    (nnweb-parse-find-1 type parse maxdepth)))
(defun nnweb-parse-find-1 (type contents maxdepth)
  "Throw CONTENTS to `found' if it is an element of TYPE.
Otherwise search the cons cells inside CONTENTS recursively.
A MAXDEPTH of nil means no depth limit; otherwise the search stops
once MAXDEPTH has been counted down to zero."
  (unless (and maxdepth (zerop maxdepth))
    (when (consp contents)
      (if (eq (car contents) type)
          (throw 'found contents))
      ;; Only walk CONTENTS when it is not a dotted pair.
      (when (listp (cdr contents))
        (let ((remaining (and maxdepth (1- maxdepth))))
          (dolist (child contents)
            (if (consp child)
                (nnweb-parse-find-1 type child remaining))))))))
(defun nnweb-parse-find-all (type parse)
  "Find all elements of TYPE in PARSE.
Return them as a list; the search itself is done by
`nnweb-parse-find-all-1'."
  (nnweb-parse-find-all-1 type parse))
(defun nnweb-parse-find-all-1 (type contents)
  "Return a list of the elements of TYPE found in CONTENTS.
If CONTENTS itself is an element of TYPE, the result is a list
holding just CONTENTS and the search does not descend into it.
Otherwise the cons cells inside CONTENTS are searched recursively
and their results concatenated in order."
  (let (result)
    (when (consp contents)
      (if (eq (car contents) type)
          (push contents result)
        ;; Only walk CONTENTS when it is not a dotted pair.
        (when (listp (cdr contents))
          (dolist (element contents)
            (when (consp element)
              ;; Store the result of `nconc': when RESULT is nil,
              ;; `nconc' cannot modify it in place.
              (setq result
                    (nconc result
                           (nnweb-parse-find-all-1 type element))))))))
    result))
;; `nnweb-text' must be bound dynamically below, because
;; `nnweb-text-1' pushes onto it as a free variable.
(defvar nnweb-text)

(defun nnweb-text (parse)
  "Return a list of text contents in PARSE.
The strings are collected by `nnweb-text-1' and returned in the
order in which they occur in PARSE."
  (let ((nnweb-text nil))
    (nnweb-text-1 parse)
    ;; `nnweb-text-1' pushes, so the strings arrive in reverse order.
    (nreverse nnweb-text)))
(defun nnweb-text-1 (contents)
  "Push every string found in CONTENTS onto the variable `nnweb-text'.
Proper-list members of CONTENTS are searched recursively; other
members, including dotted pairs, are ignored."
  (dolist (node contents)
    (cond
     ((stringp node)
      (push node nnweb-text))
     ((and (consp node)
           (listp (cdr node)))
      (nnweb-text-1 node)))))
593 ;;; arch-tag: f59307eb-c90f-479f-b7d2-dbd8bf51b697
594 ;;; nnweb.el ends here