pc+-file-reader: Fix memory leak.
[pspp.git] / src / data / sys-file-private.c
blob9114f54e2fda289a4c3a60a962434845c8aec48b
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2009, 2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include "data/sys-file-private.h"
21 #include "data/dictionary.h"
22 #include "data/value.h"
23 #include "data/variable.h"
24 #include "libpspp/assertion.h"
25 #include "libpspp/misc.h"
27 #include "gl/c-strcase.h"
28 #include "gl/minmax.h"
29 #include "gl/xalloc.h"
/* Number of bytes really stored in each segment of a very long
   string variable. */
#define REAL_VLS_CHUNK 255

/* Number of bytes per segment by which the amount of space for
   very long string variables is allocated. */
#define EFFECTIVE_VLS_CHUNK 252
/* Returns true if WIDTH is a very long string width (256 bytes or
   more), false otherwise. */
static bool
is_very_long (int width)
{
  return width >= 256;
}
/* Returns the smaller of A or B.
   (Defined as a function so that each argument expression is
   evaluated exactly once at the call site.) */
static int
min_int (int a, int b)
{
  return a < b ? a : b;
}
/* Returns the larger of A or B.
   (Defined as a function so that each argument expression is
   evaluated exactly once at the call site.) */
static int
max_int (int a, int b)
{
  return a > b ? a : b;
}
65 /* Returns the number of bytes of uncompressed case data used for
66 writing a variable of the given WIDTH to a system file. All
67 required space is included, including trailing padding and
68 internal padding. */
69 static int
70 sfm_width_to_bytes (int width)
72 int bytes;
74 assert (width >= 0);
76 if (width == 0)
77 bytes = 8;
78 else if (!is_very_long (width))
79 bytes = width;
80 else
82 int chunks = width / EFFECTIVE_VLS_CHUNK;
83 int remainder = width % EFFECTIVE_VLS_CHUNK;
84 bytes = remainder + (chunks * ROUND_UP (REAL_VLS_CHUNK, 8));
86 return ROUND_UP (bytes, 8);
/* Returns the number of 8-byte units (octs) used to write data
   for a variable of the given WIDTH.  This is
   sfm_width_to_bytes (WIDTH) divided by 8. */
int
sfm_width_to_octs (int width)
{
  return sfm_width_to_bytes (width) / 8;
}
97 /* Returns the number of "segments" used for writing case data
98 for a variable of the given WIDTH. A segment is a physical
99 variable in the system file that represents some piece of a
100 logical variable as seen by a PSPP user. Only very long
101 string variables have more than one segment. */
103 sfm_width_to_segments (int width)
105 assert (width >= 0);
107 return !is_very_long (width) ? 1 : DIV_RND_UP (width, EFFECTIVE_VLS_CHUNK);
110 /* Returns the width to allocate to the given SEGMENT within a
111 variable of the given WIDTH. A segment is a physical variable
112 in the system file that represents some piece of a logical
113 variable as seen by a PSPP user. */
115 sfm_segment_alloc_width (int width, int segment)
117 assert (segment < sfm_width_to_segments (width));
119 return (!is_very_long (width) ? width
120 : segment < sfm_width_to_segments (width) - 1 ? 255
121 : width - segment * EFFECTIVE_VLS_CHUNK);
/* Returns the number of bytes to allocate to the given SEGMENT
   within a variable of the given WIDTH.  This is
   sfm_segment_alloc_width rounded up to a multiple of 8, except
   that a numeric value takes up 8 bytes despite having a width
   of 0. */
static int
sfm_segment_alloc_bytes (int width, int segment)
{
  assert (segment < sfm_width_to_segments (width));
  return (width == 0 ? 8
          : ROUND_UP (sfm_segment_alloc_width (width, segment), 8));
}
136 /* Returns the number of bytes in the given SEGMENT within a
137 variable of the given WIDTH that are actually used to store
138 data. For a numeric value (WIDTH of 0), this is 8 bytes; for
139 a string value less than 256 bytes wide, it is WIDTH bytes.
140 For very long string values, the calculation is more
141 complicated and ranges between 255 bytes for the first segment
142 to as little as 0 bytes for final segments. */
143 static int
144 sfm_segment_used_bytes (int width, int segment)
146 assert (segment < sfm_width_to_segments (width));
147 return (width == 0 ? 8
148 : !is_very_long (width) ? width
149 : max_int (0, min_int (width - REAL_VLS_CHUNK * segment,
150 REAL_VLS_CHUNK)));
/* Returns the number of bytes at the end of the given SEGMENT
   within a variable of the given WIDTH that are not used for
   data; that is, the number of bytes that must be padded with
   data that a reader ignores.  Computed as the segment's
   allocated bytes minus its used bytes. */
static int
sfm_segment_padding (int width, int segment)
{
  return (sfm_segment_alloc_bytes (width, segment)
          - sfm_segment_used_bytes (width, segment));
}
164 /* Returns the byte offset of the start of the given SEGMENT
165 within a variable of the given WIDTH. The first segment
166 starts at offset 0; only very long string variables have any
167 other segments. */
168 static int
169 sfm_segment_offset (int width, int segment)
171 assert (segment < sfm_width_to_segments (width));
172 return min_int (REAL_VLS_CHUNK * segment, width);
175 /* Returns the byte offset of the start of the given SEGMENT
176 within a variable of the given WIDTH, given the (incorrect)
177 assumption that there are EFFECTIVE_VLS_CHUNK bytes per
178 segment. (Use of this function is questionable at best.) */
180 sfm_segment_effective_offset (int width, int segment)
182 assert (segment < sfm_width_to_segments (width));
183 return EFFECTIVE_VLS_CHUNK * segment;
186 /* Creates and initializes an array of struct sfm_vars that
187 describe how a case drawn from dictionary DICT is laid out in
188 a system file. Returns the number of segments in a case. A
189 segment is a physical variable in the system file that
190 represents some piece of a logical variable as seen by a PSPP
191 user.
193 The array is allocated with malloc and stored in *SFM_VARS,
194 and its number of elements is stored in *SFM_VAR_CNT. The
195 caller is responsible for freeing it when it is no longer
196 needed. */
198 sfm_dictionary_to_sfm_vars (const struct dictionary *dict,
199 struct sfm_var **sfm_vars, size_t *sfm_var_cnt)
201 size_t var_cnt = dict_get_var_cnt (dict);
202 size_t segment_cnt;
203 size_t i;
205 /* Estimate the number of sfm_vars that will be needed.
206 We might not need all of these, because very long string
207 variables can have segments that are all padding, which do
208 not need sfm_vars of their own. */
209 segment_cnt = 0;
210 for (i = 0; i < var_cnt; i++)
212 const struct variable *v = dict_get_var (dict, i);
213 segment_cnt += sfm_width_to_segments (var_get_width (v));
216 /* Compose the sfm_vars. */
217 *sfm_vars = xnmalloc (segment_cnt, sizeof **sfm_vars);
218 *sfm_var_cnt = 0;
219 for (i = 0; i < var_cnt; i++)
221 const struct variable *dv = dict_get_var (dict, i);
222 int width = var_get_width (dv);
223 int j;
225 for (j = 0; j < sfm_width_to_segments (width); j++)
227 int used_bytes = sfm_segment_used_bytes (width, j);
228 int padding = sfm_segment_padding (width, j);
229 struct sfm_var *sv;
230 if (used_bytes != 0)
232 sv = &(*sfm_vars)[(*sfm_var_cnt)++];
233 sv->var_width = width;
234 sv->segment_width = width == 0 ? 0 : used_bytes;
235 sv->case_index = var_get_case_index (dv);
236 sv->offset = sfm_segment_offset (width, j);
237 sv->padding = padding;
239 else
241 /* Segment is all padding. Just add it to the
242 previous segment. */
243 sv = &(*sfm_vars)[*sfm_var_cnt - 1];
244 sv->padding += padding;
246 assert ((sv->segment_width + sv->padding) % 8 == 0);
250 return segment_cnt;
253 /* Given the name of an encoding, returns the codepage number to use in the
254 'character_code' member of the machine integer info record for writing a
255 system file. */
257 sys_get_codepage_from_encoding (const char *name)
259 const struct sys_encoding *e;
261 for (e = sys_codepage_name_to_number; e->name != NULL; e++)
262 if (!c_strcasecmp (name, e->name))
263 return e->number;
265 return 0;
268 /* Given a codepage number from the 'character_code' member of the machine
269 integer info record in a system file, returns a corresponding encoding name.
270 Most encodings have multiple aliases; the one returned is the one that would
271 be used in the character encoding record. */
272 const char *
273 sys_get_encoding_from_codepage (int codepage)
275 const struct sys_encoding *e;
277 for (e = sys_codepage_number_to_name; e->name != NULL; e++)
278 if (codepage == e->number)
279 return e->name;
281 return NULL;