1 ;;; nnweb.el --- retrieving articles via web search engines
2 ;; Copyright (C) 1996,97,98 Free Software Foundation, Inc.
4 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
7 ;; This file is part of GNU Emacs.
9 ;; GNU Emacs is free software; you can redistribute it and/or modify
10 ;; it under the terms of the GNU General Public License as published by
11 ;; the Free Software Foundation; either version 2, or (at your option)
14 ;; GNU Emacs is distributed in the hope that it will be useful,
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;; GNU General Public License for more details.
19 ;; You should have received a copy of the GNU General Public License
20 ;; along with GNU Emacs; see the file COPYING. If not, write to the
21 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 ;; Boston, MA 02111-1307, USA.
26 ;; Note: You need to have `url' and `w3' installed for this
31 (eval-when-compile (require 'cl
))
33 (eval-when-compile (require 'cl
))
45 ;; Report failure to find w3 at load time if appropriate.
53 (defvoo nnweb-directory
(nnheader-concat gnus-directory
"nnweb/")
54 "Where nnweb will save its files.")
56 (defvoo nnweb-type
'dejanews
57 "What search engine type is being used.
58 Valid types include `dejanews', `dejanewsold', `reference',
61 (defvoo nnweb-type-definition
63 (article . nnweb-dejanews-wash-article
)
64 (map . nnweb-dejanews-create-mapping
)
65 (search . nnweb-dejanews-search
)
66 (address .
"http://x8.dejanews.com/dnquery.xp")
67 (identifier . nnweb-dejanews-identity
))
69 (article . nnweb-dejanews-wash-article
)
70 (map . nnweb-dejanews-create-mapping
)
71 (search . nnweb-dejanewsold-search
)
72 (address .
"http://x8.dejanews.com/dnquery.xp")
73 (identifier . nnweb-dejanews-identity
))
75 (article . nnweb-reference-wash-article
)
76 (map . nnweb-reference-create-mapping
)
77 (search . nnweb-reference-search
)
78 (address .
"http://www.reference.com/cgi-bin/pn/go")
79 (identifier . identity
))
81 (article . nnweb-altavista-wash-article
)
82 (map . nnweb-altavista-create-mapping
)
83 (search . nnweb-altavista-search
)
84 (address .
"http://www.altavista.digital.com/cgi-bin/query")
85 (id .
"/cgi-bin/news?id@%s")
86 (identifier . identity
)))
87 "Type-definition alist.")
89 (defvoo nnweb-search nil
90 "Search string to feed to DejaNews.")
92 (defvoo nnweb-max-hits
999
93 "Maximum number of hits to display.")
95 (defvoo nnweb-ephemeral-p nil
96 "Whether this nnweb server is ephemeral.")
98 ;;; Internal variables
100 (defvoo nnweb-articles nil
)
101 (defvoo nnweb-buffer nil
)
102 (defvoo nnweb-group-alist nil
)
103 (defvoo nnweb-group nil
)
104 (defvoo nnweb-hashtb nil
)
106 ;;; Interface functions
108 (nnoo-define-basics nnweb
)
110 (deffoo nnweb-retrieve-headers
(articles &optional group server fetch-old
)
111 (nnweb-possibly-change-server group server
)
113 (set-buffer nntp-server-buffer
)
115 (let (article header
)
116 (while (setq article
(pop articles
))
117 (when (setq header
(cadr (assq article nnweb-articles
)))
118 (nnheader-insert-nov header
)))
121 (deffoo nnweb-request-scan
(&optional group server
)
122 (nnweb-possibly-change-server group server
)
123 (setq nnweb-hashtb
(gnus-make-hashtable 4095))
124 (funcall (nnweb-definition 'map
))
125 (unless nnweb-ephemeral-p
127 (nnweb-write-overview group
)))
129 (deffoo nnweb-request-group
(group &optional server dont-check
)
130 (nnweb-possibly-change-server nil server
)
132 (not (equal group nnweb-group
))
133 (not nnweb-ephemeral-p
))
134 (let ((info (assoc group nnweb-group-alist
)))
135 (setq nnweb-group group
)
136 (setq nnweb-type
(nth 2 info
))
137 (setq nnweb-search
(nth 3 info
))
139 (nnweb-read-overview group
))))
141 ((not nnweb-articles
)
142 (nnheader-report 'nnweb
"No matching articles"))
144 (let ((active (if nnweb-ephemeral-p
145 (cons (caar nnweb-articles
)
146 (caar (last nnweb-articles
)))
147 (cadr (assoc group nnweb-group-alist
)))))
148 (nnheader-report 'nnweb
"Opened group %s" group
)
150 "211 %d %d %d %s\n" (length nnweb-articles
)
151 (car active
) (cdr active
) group
)))))
153 (deffoo nnweb-close-group
(group &optional server
)
154 (nnweb-possibly-change-server group server
)
155 (when (gnus-buffer-live-p nnweb-buffer
)
157 (set-buffer nnweb-buffer
)
158 (set-buffer-modified-p nil
)
159 (kill-buffer nnweb-buffer
)))
162 (deffoo nnweb-request-article
(article &optional group server buffer
)
163 (nnweb-possibly-change-server group server
)
165 (set-buffer (or buffer nntp-server-buffer
))
166 (let* ((header (cadr (assq article nnweb-articles
)))
167 (url (and header
(mail-header-xref header
))))
169 (nnweb-fetch-url url
))
170 (and (stringp article
)
171 (nnweb-definition 'id t
)
172 (let ((fetch (nnweb-definition 'id
))
174 (when (string-match "^<\\(.*\\)>$" article
)
175 (setq art
(match-string 1 article
)))
179 (format fetch article
))))))
180 (unless nnheader-callback-function
181 (funcall (nnweb-definition 'article
))
182 (nnweb-decode-entities))
183 (nnheader-report 'nnweb
"Fetched article %s" article
)
186 (deffoo nnweb-close-server
(&optional server
)
187 (when (and (nnweb-server-opened server
)
188 (gnus-buffer-live-p nnweb-buffer
))
190 (set-buffer nnweb-buffer
)
191 (set-buffer-modified-p nil
)
192 (kill-buffer nnweb-buffer
)))
193 (nnoo-close-server 'nnweb server
))
195 (deffoo nnweb-request-list
(&optional server
)
196 (nnweb-possibly-change-server nil server
)
198 (set-buffer nntp-server-buffer
)
199 (nnmail-generate-active nnweb-group-alist
)
202 (deffoo nnweb-request-update-info
(group info
&optional server
)
203 (nnweb-possibly-change-server group server
)
204 ;;(setcar (cddr info) nil)
207 (deffoo nnweb-asynchronous-p
()
210 (deffoo nnweb-request-create-group
(group &optional server args
)
211 (nnweb-possibly-change-server nil server
)
212 (nnweb-request-delete-group group
)
213 (push `(,group
,(cons 1 0) ,@args
) nnweb-group-alist
)
217 (deffoo nnweb-request-delete-group
(group &optional force server
)
218 (nnweb-possibly-change-server group server
)
219 (gnus-pull group nnweb-group-alist
)
220 (gnus-delete-file (nnweb-overview-file group
))
223 (nnoo-define-skeleton nnweb
)
225 ;;; Internal functions
227 (defun nnweb-read-overview (group)
228 "Read the overview of GROUP and build the map."
229 (when (file-exists-p (nnweb-overview-file group
))
230 (nnheader-temp-write nil
231 (nnheader-insert-file-contents (nnweb-overview-file group
))
232 (goto-char (point-min))
235 (setq header
(nnheader-parse-nov))
237 (push (list (mail-header-number header
)
238 header
(mail-header-xref header
))
240 (nnweb-set-hashtb header
(car nnweb-articles
)))))))
242 (defun nnweb-write-overview (group)
243 "Write the overview file for GROUP."
244 (nnheader-temp-write (nnweb-overview-file group
)
245 (let ((articles nnweb-articles
))
247 (nnheader-insert-nov (cadr (pop articles
)))))))
249 (defun nnweb-set-hashtb (header data
)
250 (gnus-sethash (nnweb-identifier (mail-header-xref header
))
253 (defun nnweb-get-hashtb (url)
254 (gnus-gethash (nnweb-identifier url
) nnweb-hashtb
))
256 (defun nnweb-identifier (ident)
257 (funcall (nnweb-definition 'identifier
) ident
))
259 (defun nnweb-overview-file (group)
260 "Return the name of the overview file of GROUP."
261 (nnheader-concat nnweb-directory group
".overview"))
263 (defun nnweb-write-active ()
264 "Save the active file."
265 (nnheader-temp-write (nnheader-concat nnweb-directory
"active")
266 (prin1 `(setq nnweb-group-alist
',nnweb-group-alist
) (current-buffer))))
268 (defun nnweb-read-active ()
269 "Read the active file."
270 (load (nnheader-concat nnweb-directory
"active") t t t
))
272 (defun nnweb-definition (type &optional noerror
)
273 "Return the definition of TYPE."
274 (let ((def (cdr (assq type
(assq nnweb-type nnweb-type-definition
)))))
277 (error "Undefined definition %s" type
))
280 (defun nnweb-possibly-change-server (&optional group server
)
283 (unless (nnweb-server-opened server
)
284 (nnweb-open-server server
)))
285 (unless nnweb-group-alist
288 (when (and (not nnweb-ephemeral-p
)
289 (not (equal group nnweb-group
)))
290 (nnweb-request-group group nil t
))))
292 (defun nnweb-init (server)
293 "Initialize buffers and such."
294 (unless (gnus-buffer-live-p nnweb-buffer
)
297 (nnheader-set-temp-buffer
298 (format " *nnweb %s %s %s*" nnweb-type nnweb-search server
))))))
300 (defun nnweb-fetch-url (url)
302 (if (not nnheader-callback-function
)
303 (let ((buf (current-buffer)))
305 (set-buffer nnweb-buffer
)
307 (url-insert-file-contents url
)
308 (copy-to-buffer buf
(point-min) (point-max))
310 (nnweb-url-retrieve-asynch
311 url
'nnweb-callback
(current-buffer) nnheader-callback-function
)
314 (defun nnweb-callback (buffer callback
)
315 (when (gnus-buffer-live-p url-working-buffer
)
317 (set-buffer url-working-buffer
)
318 (funcall (nnweb-definition 'article
))
319 (nnweb-decode-entities)
321 (goto-char (point-max))
322 (insert-buffer-substring url-working-buffer
))
324 (gnus-kill-buffer url-working-buffer
)))
326 (defun nnweb-url-retrieve-asynch (url callback
&rest data
)
327 (let ((url-request-method "GET")
328 (old-asynch url-be-asynchronous
)
329 (url-request-data nil
)
330 (url-request-extra-headers nil
)
331 (url-working-buffer (generate-new-buffer-name " *nnweb*")))
332 (setq-default url-be-asynchronous t
)
334 (set-buffer (get-buffer-create url-working-buffer
))
335 (setq url-current-callback-data data
336 url-be-asynchronous t
337 url-current-callback-func callback
)
339 (setq-default url-be-asynchronous old-asynch
)))
341 (defun nnweb-encode-www-form-urlencoded (pairs)
342 "Return PAIRS encoded for forms."
346 (concat (w3-form-encode-xwfu (car data
)) "="
347 (w3-form-encode-xwfu (cdr data
)))))
350 (defun nnweb-fetch-form (url pairs
)
351 (let ((url-request-data (nnweb-encode-www-form-urlencoded pairs
))
352 (url-request-method "POST")
353 (url-request-extra-headers
354 '(("Content-type" .
"application/x-www-form-urlencoded"))))
355 (url-insert-file-contents url
)
356 (setq buffer-file-name nil
))
359 (defun nnweb-decode-entities ()
360 (goto-char (point-min))
361 (while (re-search-forward "&\\([a-z]+\\);" nil t
)
362 (replace-match (char-to-string (or (cdr (assq (intern (match-string 1))
367 (defun nnweb-remove-markup ()
368 (goto-char (point-min))
369 (while (search-forward "<!--" nil t
)
370 (delete-region (match-beginning 0)
371 (or (search-forward "-->" nil t
)
373 (goto-char (point-min))
374 (while (re-search-forward "<[^>]+>" nil t
)
375 (replace-match "" t t
)))
378 ;;; DejaNews functions.
381 (defun nnweb-dejanews-create-mapping ()
382 "Perform the search and create an number-to-url alist."
384 (set-buffer nnweb-buffer
)
386 (when (funcall (nnweb-definition 'search
) nnweb-search
)
390 (active (or (cadr (assoc nnweb-group nnweb-group-alist
))
392 Subject
(Score "0") Date Newsgroup Author
395 ;; Go through all the article hits on this page.
396 (goto-char (point-min))
397 (nnweb-decode-entities)
398 (goto-char (point-min))
399 (while (re-search-forward "^ <P>\n" nil t
)
402 (cond ((re-search-forward "^ <P>\n" nil t
)
404 ((search-forward "\n\n" nil t
)
408 (goto-char (point-min))
409 (looking-at ".*HREF=\"\\([^\"]+\\)\"\\(.*\\)")
410 (setq url
(match-string 1))
411 (let ((begin (point)))
412 (nnweb-remove-markup)
414 (while (search-forward "\t" nil t
)
418 (setq Subject
(buffer-substring begin
(point)))
419 (if (re-search-forward
420 "^ Newsgroup: \\(.*\\)\n Posted on \\([0-9/]+\\) by \\(.*\\)$" nil t
)
421 (setq Newsgroup
(match-string 1)
422 Date
(match-string 2)
423 Author
(match-string 3))))
426 (unless (nnweb-get-hashtb url
)
430 (make-full-mail-header
431 (cdr active
) Subject Author Date
432 (concat "<" (nnweb-identifier url
) "@dejanews>")
433 nil
0 (string-to-int Score
) url
))
435 (nnweb-set-hashtb (cadar map
) (car map
))))
436 ;; See whether there is a "Get next 20 hits" button here.
437 (if (or (not (re-search-forward
438 "HREF=\"\\([^\"]+\\)\"[<>b]+Next result" nil t
))
439 (>= i nnweb-max-hits
))
442 (setq more
(match-string 1))
444 (url-insert-file-contents more
)))
445 ;; Return the articles in the right order.
447 (sort (nconc nnweb-articles map
) 'car-less-than-car
))))))
449 (defun nnweb-dejanews-wash-article ()
450 (let ((case-fold-search t
))
451 (goto-char (point-min))
452 (re-search-forward "<PRE>" nil t
)
453 (delete-region (point-min) (point))
454 (re-search-forward "</PRE>" nil t
)
455 (delete-region (point) (point-max))
456 (nnweb-remove-markup)
457 (goto-char (point-min))
458 (while (and (looking-at " *$")
461 (while (looking-at "\\(^[^ ]+:\\) *")
462 (replace-match "\\1 " t
)
464 (when (re-search-forward "\n\n+" nil t
)
465 (replace-match "\n" t t
))
466 (goto-char (point-min))
467 (when (search-forward "[More Headers]" nil t
)
468 (replace-match "" t t
))))
470 (defun nnweb-dejanews-search (search)
472 (nnweb-definition 'address
)
473 `(("query" .
,search
)
474 ("defaultOp" .
"AND")
475 ("svcclass" .
"dncurrent")
477 ("format" .
"verbose2")
479 ("showsort" .
"date")
481 ("ageweight" .
"1")))
484 (defun nnweb-dejanewsold-search (search)
486 (nnweb-definition 'address
)
487 `(("query" .
,search
)
488 ("defaultOp" .
"AND")
489 ("svcclass" .
"dnold")
491 ("format" .
"verbose2")
493 ("showsort" .
"date")
495 ("ageweight" .
"1")))
498 (defun nnweb-dejanews-identity (url)
499 "Return an unique identifier based on URL."
500 (if (string-match "recnum=\\([0-9]+\\)" url
)
508 (defun nnweb-reference-create-mapping ()
509 "Perform the search and create an number-to-url alist."
511 (set-buffer nnweb-buffer
)
513 (when (funcall (nnweb-definition 'search
) nnweb-search
)
517 (active (or (cadr (assoc nnweb-group nnweb-group-alist
))
519 Subject Score Date Newsgroups From Message-ID
522 ;; Go through all the article hits on this page.
523 (goto-char (point-min))
524 (search-forward "</pre><hr>" nil t
)
525 (delete-region (point-min) (point))
526 ;(nnweb-decode-entities)
527 (goto-char (point-min))
528 (while (re-search-forward "^ +[0-9]+\\." nil t
)
531 (if (re-search-forward "^$" nil t
)
534 (goto-char (point-min))
535 (when (looking-at ".*href=\"\\([^\"]+\\)\"")
536 (setq url
(match-string 1)))
537 (nnweb-remove-markup)
538 (goto-char (point-min))
539 (while (search-forward "\t" nil t
)
541 (goto-char (point-min))
542 (while (re-search-forward "^\\([^:]+\\): \\(.*\\)$" nil t
)
543 (set (intern (match-string 1)) (match-string 2)))
545 (search-forward "</pre>" nil t
)
547 (unless (nnweb-get-hashtb url
)
551 (make-full-mail-header
552 (cdr active
) (concat "(" Newsgroups
") " Subject
) From Date
554 nil
0 (string-to-int Score
) url
))
556 (nnweb-set-hashtb (cadar map
) (car map
))))
558 ;; Return the articles in the right order.
560 (sort (nconc nnweb-articles map
) 'car-less-than-car
))))))
562 (defun nnweb-reference-wash-article ()
563 (let ((case-fold-search t
))
564 (goto-char (point-min))
565 (re-search-forward "^</center><hr>" nil t
)
566 (delete-region (point-min) (point))
567 (search-forward "<pre>" nil t
)
569 (let ((body (point-marker)))
570 (search-forward "</pre>" nil t
)
571 (delete-region (point) (point-max))
572 (nnweb-remove-markup)
573 (goto-char (point-min))
574 (while (looking-at " *$")
576 (narrow-to-region (point-min) body
)
577 (while (and (re-search-forward "^$" nil t
)
580 (goto-char (point-min))
581 (while (looking-at "\\(^[^ ]+:\\) *")
582 (replace-match "\\1 " t
)
584 (goto-char (point-min))
585 (when (re-search-forward "^References:" nil t
)
587 (point) (if (re-search-forward "^$\\|^[^:]+:" nil t
)
590 (goto-char (point-min))
592 (unless (looking-at "References")
595 (goto-char (point-min))
596 (while (search-forward "," nil t
)
597 (replace-match " " t t
)))
599 (set-marker body nil
))))
601 (defun nnweb-reference-search (search)
602 (url-insert-file-contents
604 (nnweb-definition 'address
)
606 (nnweb-encode-www-form-urlencoded
607 `(("search" .
"advanced")
608 ("querytext" .
,search
)
613 ("organization" .
"")
616 ("choice" .
"Search")
617 ("startmonth" .
"Jul")
619 ("startyear" .
"1996")
624 ("verbosity" .
"Verbose")
625 ("ranking" .
"Relevance")
629 (setq buffer-file-name nil
)
636 (defun nnweb-altavista-create-mapping ()
637 "Perform the search and create an number-to-url alist."
639 (set-buffer nnweb-buffer
)
642 (when (funcall (nnweb-definition 'search
) nnweb-search part
)
646 (active (or (cadr (assoc nnweb-group nnweb-group-alist
))
648 subject date from id group
651 ;; Go through all the article hits on this page.
652 (goto-char (point-min))
653 (search-forward "<dt>" nil t
)
654 (delete-region (point-min) (match-beginning 0))
655 (goto-char (point-min))
656 (while (search-forward "<dt>" nil t
)
657 (replace-match "\n<blubb>"))
658 (nnweb-decode-entities)
659 (goto-char (point-min))
660 (while (re-search-forward "<blubb>.*href=\"\\([^\"]+\\)\"><strong>\\([^>]*\\)</strong></a><dd>\\([^-]+\\)- <b>\\([^<]+\\)<.*href=\"news:\\([^\"]+\\)\">.*\">\\(.+\\)</a><P>"
662 (setq url
(match-string 1)
663 subject
(match-string 2)
664 date
(match-string 3)
665 group
(match-string 4)
666 id
(concat "<" (match-string 5) ">")
667 from
(match-string 6))
669 (unless (nnweb-get-hashtb url
)
673 (make-full-mail-header
674 (cdr active
) (concat "(" group
") " subject
) from date
677 (nnweb-set-hashtb (cadar map
) (car map
))))
678 ;; See if we want more.
679 (when (or (not nnweb-articles
)
680 (>= i nnweb-max-hits
)
681 (not (funcall (nnweb-definition 'search
)
682 nnweb-search
(incf part
))))
684 ;; Return the articles in the right order.
686 (sort (nconc nnweb-articles map
) 'car-less-than-car
)))))))
688 (defun nnweb-altavista-wash-article ()
689 (goto-char (point-min))
690 (let ((case-fold-search t
))
691 (when (re-search-forward "^<strong>" nil t
)
692 (delete-region (point-min) (match-beginning 0)))
693 (goto-char (point-min))
694 (while (looking-at "<strong>\\([^ ]+\\) +</strong> +\\(.*\\)$")
695 (replace-match "\\1: \\2" t
)
697 (when (re-search-backward "^References:" nil t
)
698 (narrow-to-region (point) (progn (forward-line 1) (point)))
699 (goto-char (point-min))
700 (while (re-search-forward "<A.*\\?id@\\([^\"]+\\)\">[0-9]+</A>" nil t
)
701 (replace-match "<\\1> " t
)))
703 (nnweb-remove-markup)))
705 (defun nnweb-altavista-search (search &optional part
)
706 (url-insert-file-contents
708 (nnweb-definition 'address
)
710 (nnweb-encode-www-form-urlencoded
713 ,@(when part
`(("stq" .
,(int-to-string (* part
30)))))
719 (setq buffer-file-name nil
)
724 ;;; nnweb.el ends here