create sql module.
[biolisp.git] / cluster / clusters.lisp
blobe5e2d700d0e49d7a33a7673f2b4e81f47033c881
1 ;; We use cl-ppcre for text munging
2 (require 'cl-ppcre)
3 (require 'cl-pdf)
5 (defparameter *HELV* (pdf:get-font "Helvetica"))
6 (defparameter *HELV-SIZE* 12.0)
7 (defparameter *CLUST-HEIGHT* 20)
9 ;; Dummy function used in delete-if
10 (defun truep (x) (declare (ignore x)) t)
12 (defun readfile (filename)
13 (with-open-file (stream filename)
14 (let* ((whitespace '(#\Space #\Tab #\Newline #\Linefeed #\Return))
15 (rownames (make-array 0 :fill-pointer 0 :adjustable t))
16 (data (make-array 0 :fill-pointer 0 :adjustable t))
17 (colnames (map 'vector #'(lambda (x) x) (cdr (cl-ppcre:split #\Tab (string-trim whitespace (read-line stream nil nil))))))
19 (loop for line = (read-line stream nil nil)
20 while line do (progn
21 (setf p (cl-ppcre:split #\Tab (string-trim whitespace line)))
22 (vector-push-extend (car p) rownames)
23 (vector-push-extend (map 'vector #'(lambda (x) (float (parse-integer x))) (cdr p)) data)))
24 ;; subseq's lock down adjustable vectors into simple vectors
25 (list (subseq rownames 0) colnames (subseq data 0)))))
27 (defun pearson-distance (v1 v2)
28 (let* ((n (length v1))
29 (sum1 (reduce #'+ v1))
30 (sum2 (reduce #'+ v2))
31 (sum1-sq (reduce #'+ (map 'vector #'(lambda (x) (* x x)) v1)))
32 (sum2-sq (reduce #'+ (map 'vector #'(lambda (x) (* x x)) v2)))
33 (psum (reduce #'+ (map 'vector #'* v1 v2)))
34 (num (- psum (/ (* sum1 sum2) n)))
35 (den (sqrt (* (- sum1-sq (/ (* sum1 sum1) n)) (- sum2-sq (/ (* sum2 sum2) n))))))
36 (if (zerop den) 0 (- 1.0 (/ num den)))))
38 (defclass bicluster ()
39 ((vec :initarg :vec :accessor vec :initform nil)
40 (left :initarg :left :accessor left :initform nil)
41 (right :initarg :right :accessor right :initform nil)
42 (id :initarg :id :accessor id :initform nil)
43 (distance :initarg :distance :accessor distance :initform 0.0)))
46 (defun hcluster (rows &optional (distance 'pearson-distance))
47 (let* ((distances (make-hash-table :test 'equalp))
48 (current-clust-id -1)
49 (rowlen (length rows))
50 (clust (make-array rowlen :fill-pointer rowlen
51 :adjustable t
52 :initial-contents (loop for i upto (1- rowlen) collect (make-instance 'bicluster :vec (aref rows i) :id i)))))
53 (do* (lowestpair closest d mergevec newcluster)
54 ((<= (length clust) 1) (aref clust 0))
55 (setf lowestpair (cons 0 1))
56 (setf closest (funcall distance (vec (aref clust 0)) (vec (aref clust 1))))
57 (dotimes (i (length clust))
58 (do ((j (1+ i) (1+ j)))
59 ((>= j (length clust)))
60 (let* ((iid (id (aref clust i)))
61 (jid (id (aref clust j)))
62 (key (cons iid jid)))
63 (unless (gethash key distances)
64 (setf (gethash key distances) (funcall distance (vec (aref clust i)) (vec (aref clust j)))))
65 (setf d (gethash key distances))
66 (when (< d closest)
67 (setf closest d)
68 (setf lowestpair (cons i j))))))
69 (setf mergevec (map 'vector #'(lambda (x y) (/ (+ x y) 2.0))
70 (vec (aref clust (car lowestpair)))
71 (vec (aref clust (cdr lowestpair)))))
72 (setf newcluster (make-instance 'bicluster :vec mergevec
73 :left (aref clust (car lowestpair))
74 :right (aref clust (cdr lowestpair))
75 :distance closest
76 :id current-clust-id))
77 (decf current-clust-id)
78 (setf clust (delete-if #'truep clust :start (cdr lowestpair) :end (1+ (cdr lowestpair))))
79 (setf clust (delete-if #'truep clust :start (car lowestpair) :end (1+ (car lowestpair))))
80 (vector-push-extend newcluster clust))))
83 (defun printclust (clust &optional labels (n 0))
84 (let ((cid (id clust)))
85 (format t "~va" n #\Space)
86 (if (< cid 0)
87 (format t "-~%")
88 (if labels
89 (format t "~a~%" (aref labels cid))
90 (format t "~a~%" cid)))
91 (when (left clust)
92 (printclust (left clust) labels (1+ n)))
93 (when (right clust)
94 (printclust (right clust) labels (1+ n)))))
96 (defun getheight (clust)
97 (if (and (left clust) (right clust))
98 (+ (getheight (left clust)) (getheight (right clust)))
99 1))
101 (defun getdepth (clust)
102 (if (and (left clust) (right clust))
103 (max (getdepth (left clust)) (+ (getdepth (right clust)) (distance clust)))
104 0.0))
106 (defun draw-line (x1 y1 x2 y2)
107 (pdf:move-to x1 y1)
108 (pdf:line-to x2 y2)
109 (pdf:stroke))
111 (defun drawdendogram (clust labels file)
112 (let* ((h (* (getheight clust) *CLUST-HEIGHT*))
113 (w 1200)
114 (depth (getdepth clust))
115 (scaling (float (/ (- w 150) depth)))
116 ; Add to w for long blog names
117 (bounds (make-array 4 :initial-contents (list 0 0 (+ w 200) h))))
118 (pdf:with-document ()
119 (pdf:with-page (:bounds bounds)
120 (pdf:set-line-width 0.1)
122 (draw-line 0 (/ h 2) 10 (/ h 2))
124 (drawnode clust 10 (/ h 2) scaling labels))
125 (with-open-file (s file :direction :output :if-exists :supersede
126 ; :element-type '(unsigned-byte 8)
127 :element-type :default
128 :external-format :latin-1)
129 (pdf:write-document s)))))
131 (defun drawnode (clust x y scaling labels)
132 (if (< (id clust) 0)
133 (let* ((h1 (* (getheight (left clust)) *CLUST-HEIGHT*))
134 (h2 (* (getheight (right clust)) *CLUST-HEIGHT*))
135 (avg (/ (+ h1 h2) 2))
136 (top (- y avg))
137 (bottom (+ y avg))
138 (ll (* (distance clust) scaling)))
140 ; Vertical line form this cluster to children
141 (draw-line x (+ top (/ h1 2)) x (- bottom (/ h2 2)))
142 ; horizontal line to left item
143 (draw-line x (+ top (/ h1 2)) (+ x ll) (+ top (/ h1 2)))
144 ; horizontal line to right item
145 (draw-line x (- bottom (/ h2 2)) (+ x ll) (- bottom (/ h2 2)))
147 ; recurse
148 (drawnode (left clust) (+ x ll) (+ top (/ h1 2)) scaling labels)
149 (drawnode (right clust) (+ x ll) (- bottom (/ h2 2)) scaling labels))
150 (progn
151 (pdf:in-text-mode
152 (pdf:set-font *HELV* *HELV-SIZE*)
153 (pdf:move-text (+ x 5) (- y 7))
154 (pdf:draw-text (aref labels (id clust)))))))