3 * \brief Frequent/Closed itemset discovery routines
5 * This file contains the code for finding frequent/closed itemests. These routines
6 * are implemented using a call-back mechanism to deal with the discovered itemsets.
9 * \author George Karypis
10 * \version\verbatim $Id: itemsets.c 11075 2011-11-11 22:31:52Z karypis $ \endverbatim
15 /*-------------------------------------------------------------*/
16 /*! Data structures for use within this module */
17 /*-------------------------------------------------------------*/
19 int minfreq
; /* the minimum frequency of a pattern */
20 int maxfreq
; /* the maximum frequency of a pattern */
21 int minlen
; /* the minimum length of the requested pattern */
22 int maxlen
; /* the maximum length of the requested pattern */
23 int tnitems
; /* the initial range of the item space */
25 /* the call-back function */
26 void (*callback
)(void *stateptr
, int nitems
, int *itemids
, int ntrans
, int *transids
);
27 void *stateptr
; /* the user-supplied pointer to pass to the callback */
29 /* workspace variables */
35 /*-------------------------------------------------------------*/
36 /*! Prototypes for this module */
37 /*-------------------------------------------------------------*/
38 void itemsets_find_frequent_itemsets(isparams_t
*params
, gk_csr_t
*mat
,
39 int preflen
, int *prefix
);
40 gk_csr_t
*itemsets_project_matrix(isparams_t
*param
, gk_csr_t
*mat
, int cid
);
44 /*************************************************************************/
45 /*! The entry point of the frequent itemset discovery code */
46 /*************************************************************************/
47 void gk_find_frequent_itemsets(int ntrans
, ssize_t
*tranptr
, int *tranind
,
48 int minfreq
, int maxfreq
, int minlen
, int maxlen
,
49 void (*process_itemset
)(void *stateptr
, int nitems
, int *itemids
,
50 int ntrans
, int *transids
),
58 /* Create the matrix */
59 mat
= gk_csr_Create();
61 mat
->ncols
= tranind
[gk_iargmax(tranptr
[ntrans
], tranind
)]+1;
62 mat
->rowptr
= gk_zcopy(ntrans
+1, tranptr
, gk_zmalloc(ntrans
+1, "gk_find_frequent_itemsets: mat.rowptr"));
63 mat
->rowind
= gk_icopy(tranptr
[ntrans
], tranind
, gk_imalloc(tranptr
[ntrans
], "gk_find_frequent_itemsets: mat.rowind"));
64 mat
->colids
= gk_iincset(mat
->ncols
, 0, gk_imalloc(mat
->ncols
, "gk_find_frequent_itemsets: mat.colids"));
66 /* Setup the parameters */
67 params
.minfreq
= minfreq
;
68 params
.maxfreq
= (maxfreq
== -1 ? mat
->nrows
: maxfreq
);
69 params
.minlen
= minlen
;
70 params
.maxlen
= (maxlen
== -1 ? mat
->ncols
: maxlen
);
71 params
.tnitems
= mat
->ncols
;
72 params
.callback
= process_itemset
;
73 params
.stateptr
= stateptr
;
74 params
.rmarker
= gk_ismalloc(mat
->nrows
, 0, "gk_find_frequent_itemsets: rmarker");
75 params
.cand
= gk_ikvmalloc(mat
->ncols
, "gk_find_frequent_itemsets: cand");
77 /* Perform the initial projection */
78 gk_csr_CreateIndex(mat
, GK_CSR_COL
);
79 pmat
= itemsets_project_matrix(¶ms
, mat
, -1);
82 pattern
= gk_imalloc(pmat
->ncols
, "gk_find_frequent_itemsets: pattern");
83 itemsets_find_frequent_itemsets(¶ms
, pmat
, 0, pattern
);
86 gk_free((void **)&pattern
, ¶ms
.rmarker
, ¶ms
.cand
, LTERM
);
92 /*************************************************************************/
93 /*! The recursive routine for DFS-based frequent pattern discovery */
94 /*************************************************************************/
95 void itemsets_find_frequent_itemsets(isparams_t
*params
, gk_csr_t
*mat
,
96 int preflen
, int *prefix
)
101 /* Project each frequent column */
102 for (i
=0; i
<mat
->ncols
; i
++) {
103 prefix
[preflen
] = mat
->colids
[i
];
105 if (preflen
+1 >= params
->minlen
)
106 (*params
->callback
)(params
->stateptr
, preflen
+1, prefix
,
107 mat
->colptr
[i
+1]-mat
->colptr
[i
], mat
->colind
+mat
->colptr
[i
]);
109 if (preflen
+1 < params
->maxlen
) {
110 cmat
= itemsets_project_matrix(params
, mat
, i
);
111 itemsets_find_frequent_itemsets(params
, cmat
, preflen
+1, prefix
);
119 /******************************************************************************/
120 /*! This function projects a matrix w.r.t. to a particular column.
121 It performs the following steps:
122 - Determines the length of each column that is remaining
123 - Sorts the columns in increasing length
124 - Creates a column-based version of the matrix with the proper
125 column ordering and renamed rowids.
127 /*******************************************************************************/
128 gk_csr_t
*itemsets_project_matrix(isparams_t
*params
, gk_csr_t
*mat
, int cid
)
130 ssize_t i
, j
, k
, ii
, pnnz
;
131 int nrows
, ncols
, pnrows
, pncols
;
132 ssize_t
*colptr
, *pcolptr
;
133 int *colind
, *colids
, *pcolind
, *pcolids
, *rmarker
;
139 colptr
= mat
->colptr
;
140 colind
= mat
->colind
;
141 colids
= mat
->colids
;
143 rmarker
= params
->rmarker
;
147 /* Allocate space for the projected matrix based on what you know thus far */
148 pmat
= gk_csr_Create();
149 pmat
->nrows
= pnrows
= (cid
== -1 ? nrows
: colptr
[cid
+1]-colptr
[cid
]);
152 /* Mark the rows that will be kept and determine the prowids */
153 if (cid
== -1) { /* Initial projection */
154 gk_iset(nrows
, 1, rmarker
);
156 else { /* The other projections */
157 for (i
=colptr
[cid
]; i
<colptr
[cid
+1]; i
++)
158 rmarker
[colind
[i
]] = 1;
162 /* Determine the length of each column that will be left in the projected matrix */
163 for (pncols
=0, pnnz
=0, i
=cid
+1; i
<ncols
; i
++) {
164 for (k
=0, j
=colptr
[i
]; j
<colptr
[i
+1]; j
++) {
165 k
+= rmarker
[colind
[j
]];
167 if (k
>= params
->minfreq
&& k
<= params
->maxfreq
) {
168 cand
[pncols
].val
= i
;
169 cand
[pncols
++].key
= k
;
174 /* Sort the columns in increasing order */
175 gk_ikvsorti(pncols
, cand
);
178 /* Allocate space for the remaining fields of the projected matrix */
179 pmat
->ncols
= pncols
;
180 pmat
->colids
= pcolids
= gk_imalloc(pncols
, "itemsets_project_matrix: pcolids");
181 pmat
->colptr
= pcolptr
= gk_zmalloc(pncols
+1, "itemsets_project_matrix: pcolptr");
182 pmat
->colind
= pcolind
= gk_imalloc(pnnz
, "itemsets_project_matrix: pcolind");
185 /* Populate the projected matrix */
187 for (pnnz
=0, ii
=0; ii
<pncols
; ii
++) {
189 for (j
=colptr
[i
]; j
<colptr
[i
+1]; j
++) {
190 if (rmarker
[colind
[j
]])
191 pcolind
[pnnz
++] = colind
[j
];
194 pcolids
[ii
] = colids
[i
];
195 pcolptr
[ii
+1] = pnnz
;
199 /* Reset the rmarker array */
200 if (cid
== -1) { /* Initial projection */
201 gk_iset(nrows
, 0, rmarker
);
203 else { /* The other projections */
204 for (i
=colptr
[cid
]; i
<colptr
[cid
+1]; i
++)
205 rmarker
[colind
[i
]] = 0;