Update copyright for 2022
[pgsql.git] / src / backend / utils / adt / encode.c
blobfeb3e830e4fd83011302403d7f453b963daacdfb
/*-------------------------------------------------------------------------
 *
 * encode.c
 *    Various data encoding/decoding things.
 *
 * Copyright (c) 2001-2022, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *    src/backend/utils/adt/encode.c
 *
 *-------------------------------------------------------------------------
 */
14 #include "postgres.h"
16 #include <ctype.h>
18 #include "mb/pg_wchar.h"
19 #include "utils/builtins.h"
20 #include "utils/memutils.h"
24 * Encoding conversion API.
25 * encode_len() and decode_len() compute the amount of space needed, while
26 * encode() and decode() perform the actual conversions. It is okay for
27 * the _len functions to return an overestimate, but not an underestimate.
28 * (Having said that, large overestimates could cause unnecessary errors,
29 * so it's better to get it right.) The conversion routines write to the
30 * buffer at *res and return the true length of their output.
32 struct pg_encoding
34 uint64 (*encode_len) (const char *data, size_t dlen);
35 uint64 (*decode_len) (const char *data, size_t dlen);
36 uint64 (*encode) (const char *data, size_t dlen, char *res);
37 uint64 (*decode) (const char *data, size_t dlen, char *res);
40 static const struct pg_encoding *pg_find_encoding(const char *name);
43 * SQL functions.
46 Datum
47 binary_encode(PG_FUNCTION_ARGS)
49 bytea *data = PG_GETARG_BYTEA_PP(0);
50 Datum name = PG_GETARG_DATUM(1);
51 text *result;
52 char *namebuf;
53 char *dataptr;
54 size_t datalen;
55 uint64 resultlen;
56 uint64 res;
57 const struct pg_encoding *enc;
59 namebuf = TextDatumGetCString(name);
61 enc = pg_find_encoding(namebuf);
62 if (enc == NULL)
63 ereport(ERROR,
64 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
65 errmsg("unrecognized encoding: \"%s\"", namebuf)));
67 dataptr = VARDATA_ANY(data);
68 datalen = VARSIZE_ANY_EXHDR(data);
70 resultlen = enc->encode_len(dataptr, datalen);
73 * resultlen possibly overflows uint32, therefore on 32-bit machines it's
74 * unsafe to rely on palloc's internal check.
76 if (resultlen > MaxAllocSize - VARHDRSZ)
77 ereport(ERROR,
78 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
79 errmsg("result of encoding conversion is too large")));
81 result = palloc(VARHDRSZ + resultlen);
83 res = enc->encode(dataptr, datalen, VARDATA(result));
85 /* Make this FATAL 'cause we've trodden on memory ... */
86 if (res > resultlen)
87 elog(FATAL, "overflow - encode estimate too small");
89 SET_VARSIZE(result, VARHDRSZ + res);
91 PG_RETURN_TEXT_P(result);
94 Datum
95 binary_decode(PG_FUNCTION_ARGS)
97 text *data = PG_GETARG_TEXT_PP(0);
98 Datum name = PG_GETARG_DATUM(1);
99 bytea *result;
100 char *namebuf;
101 char *dataptr;
102 size_t datalen;
103 uint64 resultlen;
104 uint64 res;
105 const struct pg_encoding *enc;
107 namebuf = TextDatumGetCString(name);
109 enc = pg_find_encoding(namebuf);
110 if (enc == NULL)
111 ereport(ERROR,
112 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
113 errmsg("unrecognized encoding: \"%s\"", namebuf)));
115 dataptr = VARDATA_ANY(data);
116 datalen = VARSIZE_ANY_EXHDR(data);
118 resultlen = enc->decode_len(dataptr, datalen);
121 * resultlen possibly overflows uint32, therefore on 32-bit machines it's
122 * unsafe to rely on palloc's internal check.
124 if (resultlen > MaxAllocSize - VARHDRSZ)
125 ereport(ERROR,
126 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
127 errmsg("result of decoding conversion is too large")));
129 result = palloc(VARHDRSZ + resultlen);
131 res = enc->decode(dataptr, datalen, VARDATA(result));
133 /* Make this FATAL 'cause we've trodden on memory ... */
134 if (res > resultlen)
135 elog(FATAL, "overflow - decode estimate too small");
137 SET_VARSIZE(result, VARHDRSZ + res);
139 PG_RETURN_BYTEA_P(result);
144 * HEX
147 static const char hextbl[] = "0123456789abcdef";
149 static const int8 hexlookup[128] = {
150 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
151 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
152 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
153 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
154 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
155 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
156 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
157 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
160 uint64
161 hex_encode(const char *src, size_t len, char *dst)
163 const char *end = src + len;
165 while (src < end)
167 *dst++ = hextbl[(*src >> 4) & 0xF];
168 *dst++ = hextbl[*src & 0xF];
169 src++;
171 return (uint64) len * 2;
174 static inline char
175 get_hex(const char *cp)
177 unsigned char c = (unsigned char) *cp;
178 int res = -1;
180 if (c < 127)
181 res = hexlookup[c];
183 if (res < 0)
184 ereport(ERROR,
185 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
186 errmsg("invalid hexadecimal digit: \"%.*s\"",
187 pg_mblen(cp), cp)));
189 return (char) res;
192 uint64
193 hex_decode(const char *src, size_t len, char *dst)
195 const char *s,
196 *srcend;
197 char v1,
201 srcend = src + len;
202 s = src;
203 p = dst;
204 while (s < srcend)
206 if (*s == ' ' || *s == '\n' || *s == '\t' || *s == '\r')
208 s++;
209 continue;
211 v1 = get_hex(s) << 4;
212 s++;
213 if (s >= srcend)
214 ereport(ERROR,
215 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
216 errmsg("invalid hexadecimal data: odd number of digits")));
218 v2 = get_hex(s);
219 s++;
220 *p++ = v1 | v2;
223 return p - dst;
226 static uint64
227 hex_enc_len(const char *src, size_t srclen)
229 return (uint64) srclen << 1;
232 static uint64
233 hex_dec_len(const char *src, size_t srclen)
235 return (uint64) srclen >> 1;
239 * BASE64
242 static const char _base64[] =
243 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
245 static const int8 b64lookup[128] = {
246 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
247 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
248 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
249 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
250 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
251 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
252 -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
253 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
256 static uint64
257 pg_base64_encode(const char *src, size_t len, char *dst)
259 char *p,
260 *lend = dst + 76;
261 const char *s,
262 *end = src + len;
263 int pos = 2;
264 uint32 buf = 0;
266 s = src;
267 p = dst;
269 while (s < end)
271 buf |= (unsigned char) *s << (pos << 3);
272 pos--;
273 s++;
275 /* write it out */
276 if (pos < 0)
278 *p++ = _base64[(buf >> 18) & 0x3f];
279 *p++ = _base64[(buf >> 12) & 0x3f];
280 *p++ = _base64[(buf >> 6) & 0x3f];
281 *p++ = _base64[buf & 0x3f];
283 pos = 2;
284 buf = 0;
286 if (p >= lend)
288 *p++ = '\n';
289 lend = p + 76;
292 if (pos != 2)
294 *p++ = _base64[(buf >> 18) & 0x3f];
295 *p++ = _base64[(buf >> 12) & 0x3f];
296 *p++ = (pos == 0) ? _base64[(buf >> 6) & 0x3f] : '=';
297 *p++ = '=';
300 return p - dst;
303 static uint64
304 pg_base64_decode(const char *src, size_t len, char *dst)
306 const char *srcend = src + len,
307 *s = src;
308 char *p = dst;
309 char c;
310 int b = 0;
311 uint32 buf = 0;
312 int pos = 0,
313 end = 0;
315 while (s < srcend)
317 c = *s++;
319 if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
320 continue;
322 if (c == '=')
324 /* end sequence */
325 if (!end)
327 if (pos == 2)
328 end = 1;
329 else if (pos == 3)
330 end = 2;
331 else
332 ereport(ERROR,
333 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
334 errmsg("unexpected \"=\" while decoding base64 sequence")));
336 b = 0;
338 else
340 b = -1;
341 if (c > 0 && c < 127)
342 b = b64lookup[(unsigned char) c];
343 if (b < 0)
344 ereport(ERROR,
345 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
346 errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence",
347 pg_mblen(s - 1), s - 1)));
349 /* add it to buffer */
350 buf = (buf << 6) + b;
351 pos++;
352 if (pos == 4)
354 *p++ = (buf >> 16) & 255;
355 if (end == 0 || end > 1)
356 *p++ = (buf >> 8) & 255;
357 if (end == 0 || end > 2)
358 *p++ = buf & 255;
359 buf = 0;
360 pos = 0;
364 if (pos != 0)
365 ereport(ERROR,
366 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
367 errmsg("invalid base64 end sequence"),
368 errhint("Input data is missing padding, is truncated, or is otherwise corrupted.")));
370 return p - dst;
374 static uint64
375 pg_base64_enc_len(const char *src, size_t srclen)
377 /* 3 bytes will be converted to 4, linefeed after 76 chars */
378 return ((uint64) srclen + 2) * 4 / 3 + (uint64) srclen / (76 * 3 / 4);
381 static uint64
382 pg_base64_dec_len(const char *src, size_t srclen)
384 return ((uint64) srclen * 3) >> 2;
/*
 * Escape
 *      Minimally escape bytea to text.
 *      De-escape text to bytea.
 *
 * Zero bytes and bytes with the high bit set must be escaped; otherwise
 * the generated text might be invalid in the current encoding, or might
 * turn into something else when passed through an encoding conversion
 * (so that de-escaping would no longer yield the original bytea value).
 * Backslash itself must of course be escaped as well.
 *
 * De-escaping processes \\ and any \### octal
 */

/* Convert between an ASCII digit character and its numeric value. */
#define VAL(digit_char) ((digit_char) - '0')
#define DIG(digit_val) ((digit_val) + '0')
404 static uint64
405 esc_encode(const char *src, size_t srclen, char *dst)
407 const char *end = src + srclen;
408 char *rp = dst;
409 uint64 len = 0;
411 while (src < end)
413 unsigned char c = (unsigned char) *src;
415 if (c == '\0' || IS_HIGHBIT_SET(c))
417 rp[0] = '\\';
418 rp[1] = DIG(c >> 6);
419 rp[2] = DIG((c >> 3) & 7);
420 rp[3] = DIG(c & 7);
421 rp += 4;
422 len += 4;
424 else if (c == '\\')
426 rp[0] = '\\';
427 rp[1] = '\\';
428 rp += 2;
429 len += 2;
431 else
433 *rp++ = c;
434 len++;
437 src++;
440 return len;
443 static uint64
444 esc_decode(const char *src, size_t srclen, char *dst)
446 const char *end = src + srclen;
447 char *rp = dst;
448 uint64 len = 0;
450 while (src < end)
452 if (src[0] != '\\')
453 *rp++ = *src++;
454 else if (src + 3 < end &&
455 (src[1] >= '0' && src[1] <= '3') &&
456 (src[2] >= '0' && src[2] <= '7') &&
457 (src[3] >= '0' && src[3] <= '7'))
459 int val;
461 val = VAL(src[1]);
462 val <<= 3;
463 val += VAL(src[2]);
464 val <<= 3;
465 *rp++ = val + VAL(src[3]);
466 src += 4;
468 else if (src + 1 < end &&
469 (src[1] == '\\'))
471 *rp++ = '\\';
472 src += 2;
474 else
477 * One backslash, not followed by ### valid octal. Should never
478 * get here, since esc_dec_len does same check.
480 ereport(ERROR,
481 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
482 errmsg("invalid input syntax for type %s", "bytea")));
485 len++;
488 return len;
491 static uint64
492 esc_enc_len(const char *src, size_t srclen)
494 const char *end = src + srclen;
495 uint64 len = 0;
497 while (src < end)
499 if (*src == '\0' || IS_HIGHBIT_SET(*src))
500 len += 4;
501 else if (*src == '\\')
502 len += 2;
503 else
504 len++;
506 src++;
509 return len;
512 static uint64
513 esc_dec_len(const char *src, size_t srclen)
515 const char *end = src + srclen;
516 uint64 len = 0;
518 while (src < end)
520 if (src[0] != '\\')
521 src++;
522 else if (src + 3 < end &&
523 (src[1] >= '0' && src[1] <= '3') &&
524 (src[2] >= '0' && src[2] <= '7') &&
525 (src[3] >= '0' && src[3] <= '7'))
528 * backslash + valid octal
530 src += 4;
532 else if (src + 1 < end &&
533 (src[1] == '\\'))
536 * two backslashes = backslash
538 src += 2;
540 else
543 * one backslash, not followed by ### valid octal
545 ereport(ERROR,
546 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
547 errmsg("invalid input syntax for type %s", "bytea")));
550 len++;
552 return len;
556 * Common
559 static const struct
561 const char *name;
562 struct pg_encoding enc;
563 } enclist[] =
567 "hex",
569 hex_enc_len, hex_dec_len, hex_encode, hex_decode
573 "base64",
575 pg_base64_enc_len, pg_base64_dec_len, pg_base64_encode, pg_base64_decode
579 "escape",
581 esc_enc_len, esc_dec_len, esc_encode, esc_decode
585 NULL,
587 NULL, NULL, NULL, NULL
592 static const struct pg_encoding *
593 pg_find_encoding(const char *name)
595 int i;
597 for (i = 0; enclist[i].name; i++)
598 if (pg_strcasecmp(enclist[i].name, name) == 0)
599 return &enclist[i].enc;
601 return NULL;