Postfix3.3.1
casefold.c
[詳解]
1 /*++
2 /* NAME
3 /* casefold 3
4 /* SUMMARY
5 /* casefold text for caseless comparison
6 /* SYNOPSIS
7 /* #include <stringops.h>
8 /*
9 /* char *casefold(
10 /* VSTRING *dst,
11 /* const char *src)
12 /*
13 /* char *casefold_append(
14 /* VSTRING *dst,
15 /* const char *src)
16 /*
17 /* char *casefold_len(
18 /* VSTRING *dst,
19 /* const char *src,
20 /* ssize_t src_len)
21 /* AUXILIARY FUNCTIONS
22 /* char *casefoldx(
23 /* int flags,
24 /* VSTRING *dst,
25 /* const char *src,
26 /* ssize_t src_len)
27 /* DESCRIPTION
28 /* casefold() converts text to a form that is suitable for
29 /* caseless comparison, rather than presentation to humans.
30 /*
31 /* When compiled without EAI support or util_utf8_enable is
32 /* zero, casefold() implements ASCII case folding, leaving
33 /* non-ASCII byte values unchanged.
34 /*
35 /* When compiled with EAI support and util_utf8_enable is
36 /* non-zero, casefold() implements UTF-8 case folding using
37 /* the en_US locale, as recommended when the conversion result
38 /* is not meant to be presented to humans.
39 /*
40 /* casefold_len() implements casefold() with a source length
41 /* argument.
42 /*
43 /* casefold_append() implements casefold() without overwriting
44 /* the result.
45 /*
46 /* casefoldx() implements a more complex API that implements
47 /* all of the above and more.
48 /*
49 /* Arguments:
50 /* .IP src
51 /* Null-terminated input string.
52 /* .IP dst
53 /* Output buffer, null-terminated. Specify a null pointer to
54 /* use an internal buffer that is overwritten upon each call.
55 /* .IP src_len
56 /* The string length, -1 to determine the length dynamically.
57 /* .IP flags
58 /* Bitwise OR of zero or more of the following:
59 /* .RS
60 /* .IP CASEF_FLAG_UTF8
61 /* Enable UTF-8 support. This flag has no effect when compiled
62 /* without EAI support.
63 /* .IP CASEF_FLAG_APPEND
64 /* Append the result to the buffer, instead of overwriting it.
65 /* DIAGNOSTICS
66 /* All errors are fatal. There appear to be no input-dependent
67 /* errors.
68 /*
69 /* With the ICU 4.8 library, there is no casefold error for
70 /* UTF-8 code points U+0000..U+10FFFF (including surrogate
71 /* range), not even when running inside an empty chroot jail.
72 /* Nor does malformed UTF-8 trigger errors; non-UTF-8 bytes
73 /* are copied verbatim. Based on ICU 4.8 source-code review
74 /* and experimentation(!) we conclude that UTF-8 casefolding
75 /* has no data-dependent error cases, and that it is safe to
76 /* treat all casefolding errors as fatal runtime errors.
77 /* LICENSE
78 /* .ad
79 /* .fi
80 /* The Secure Mailer license must be distributed with this software.
81 /* AUTHOR(S)
82 /* Wietse Venema
83 /* IBM T.J. Watson Research
84 /* P.O. Box 704
85 /* Yorktown Heights, NY 10598, USA
86 /*
87 /* Wietse Venema
88 /* Google, Inc.
89 /* 111 8th Avenue
90 /* New York, NY 10011, USA
91 /*--*/
92 
93 /* System library. */
94 
95 #include <sys_defs.h>
96 #include <string.h>
97 #include <ctype.h>
98 #ifndef NO_EAI
99 #include <unicode/ucasemap.h>
100 #include <unicode/ustring.h>
101 #include <unicode/uchar.h>
102 #endif
103 
104 /* Utility library. */
105 
106 #include <msg.h>
107 #include <stringops.h>
108 
109 #define STR(x) vstring_str(x)
110 #define LEN(x) VSTRING_LEN(x)
111 
112 /* casefoldx - casefold an UTF-8 string */
113 
114 char *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len)
115 {
116  size_t old_len;
117 
118 #ifdef NO_EAI
119 
120  /*
121  * ASCII mode only.
122  */
123  if (len < 0)
124  len = strlen(src);
125  if ((flags & CASEF_FLAG_APPEND) == 0)
126  VSTRING_RESET(dest);
127  old_len = VSTRING_LEN(dest);
128  vstring_strncat(dest, src, len);
129  lowercase(STR(dest) + old_len);
130  return (STR(dest));
131 #else
132 
133  /*
134  * Unicode mode.
135  */
136  const char myname[] = "casefold";
137  static VSTRING *fold_buf = 0;
138  static UCaseMap *csm = 0;
139  UErrorCode error;
140  ssize_t space_needed;
141  int n;
142 
143  /*
144  * Handle special cases.
145  */
146  if (len < 0)
147  len = strlen(src);
148  if (dest == 0)
149  dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100)));
150  if ((flags & CASEF_FLAG_APPEND) == 0)
151  VSTRING_RESET(dest);
152  old_len = VSTRING_LEN(dest);
153 
154  /*
155  * All-ASCII input, or ASCII mode only.
156  */
157  if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) {
158  vstring_strncat(dest, src, len);
159  lowercase(STR(dest) + old_len);
160  return (STR(dest));
161  }
162 
163  /*
164  * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax
165  * errors. XXX Based on source-code review we conclude that non-UTF-8
166  * bytes are copied verbatim, and experiments confirm this. Given that
167  * this behavior is intentional, we assume that it will stay that way.
168  */
169 #if 0
170  if (valid_utf8_string(src, len) == 0) {
171  if (err)
172  *err = "malformed UTF-8 or invalid codepoint";
173  return (0);
174  }
175 #endif
176 
177  /*
178  * One-time initialization. With ICU 4.8 this works while chrooted.
179  */
180  if (csm == 0) {
181  error = U_ZERO_ERROR;
182  csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error);
183  if (U_SUCCESS(error) == 0)
184  msg_fatal("ucasemap_open error: %s", u_errorName(error));
185  }
186 
187  /*
188  * Fold the input, adjusting the buffer size if needed. Safety: don't
189  * loop forever.
190  *
191  * Note: the requested amount of space for casemapped output (as reported
192  * with space_needed below) does not include storage for the null
193  * terminator. The terminator is written only when the output buffer is
194  * large enough. This is why we overallocate space when the output does
195  * not fit. But if the output fits exactly, then the output will be
196  * unterminated, and we have to terminate the output ourselves.
197  */
198  for (n = 0; n < 3; n++) {
199  error = U_ZERO_ERROR;
200  space_needed = ucasemap_utf8FoldCase(csm, STR(dest) + old_len,
201  vstring_avail(dest), src, len, &error);
202  if (U_SUCCESS(error)) {
203  VSTRING_AT_OFFSET(dest, old_len + space_needed);
204  if (vstring_avail(dest) == 0) /* exact fit, no terminator */
205  VSTRING_TERMINATE(dest); /* add terminator */
206  break;
207  } else if (error == U_BUFFER_OVERFLOW_ERROR) {
208  VSTRING_SPACE(dest, space_needed + 1); /* for terminator */
209  } else {
210  msg_fatal("%s: conversion error for \"%s\": %s",
211  myname, src, u_errorName(error));
212  }
213  }
214  return (STR(dest));
215 #endif /* NO_EAI */
216 }
217 
218 #ifdef TEST
219 
220 static void encode_utf8(VSTRING *buffer, int codepoint)
221 {
222  const char myname[] = "encode_utf8";
223 
224  VSTRING_RESET(buffer);
225  if (codepoint < 0x80) {
226  VSTRING_ADDCH(buffer, codepoint);
227  } else if (codepoint < 0x800) {
228  VSTRING_ADDCH(buffer, 0xc0 | (codepoint >> 6));
229  VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
230  } else if (codepoint < 0x10000) {
231  VSTRING_ADDCH(buffer, 0xe0 | (codepoint >> 12));
232  VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f));
233  VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
234  } else if (codepoint <= 0x10FFFF) {
235  VSTRING_ADDCH(buffer, 0xf0 | (codepoint >> 18));
236  VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 12) & 0x3f));
237  VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f));
238  VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
239  } else {
240  msg_panic("%s: out-of-range codepoint U+%X", myname, codepoint);
241  }
242  VSTRING_TERMINATE(buffer);
243 }
244 
245 #include <stdlib.h>
246 #include <stdio.h>
247 #include <locale.h>
248 
249 #include <vstream.h>
250 #include <vstring_vstream.h>
251 #include <msg_vstream.h>
252 
253 int main(int argc, char **argv)
254 {
255  VSTRING *buffer = vstring_alloc(1);
256  VSTRING *dest = vstring_alloc(1);
257  char *bp;
258  char *conv_res;
259  char *cmd;
260  int codepoint, first, last;
261  VSTREAM *fp;
262 
263  if (setlocale(LC_ALL, "C") == 0)
264  msg_fatal("setlocale(LC_ALL, C) failed: %m");
265 
266  msg_vstream_init(argv[0], VSTREAM_ERR);
267 
268  util_utf8_enable = 1;
269 
270  VSTRING_SPACE(buffer, 256); /* chroot/file pathname */
271 
272  while (vstring_fgets_nonl(buffer, VSTREAM_IN)) {
273  bp = STR(buffer);
274  vstream_printf("> %s\n", bp);
275  cmd = mystrtok(&bp, CHARS_SPACE);
276  if (cmd == 0 || *cmd == '#')
277  continue;
278  while (ISSPACE(*bp))
279  bp++;
280 
281  /*
282  * Null-terminated string.
283  */
284  if (strcmp(cmd, "fold") == 0) {
285  conv_res = casefold(dest, bp);
286  vstream_printf("\"%s\" ->fold \"%s\"\n", bp, conv_res);
287  }
288 
289  /*
290  * Codepoint range.
291  */
292  else if (strcmp(cmd, "range") == 0
293  && sscanf(bp, "%i %i", &first, &last) == 2
294  && first <= last) {
295  for (codepoint = first; codepoint <= last; codepoint++) {
296  if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
297  vstream_printf("skipping surrogate range\n");
298  codepoint = 0xDFFF;
299  } else {
300  encode_utf8(buffer, codepoint);
301  if (msg_verbose)
302  vstream_printf("U+%X -> %s\n", codepoint, STR(buffer));
303  if (valid_utf8_string(STR(buffer), LEN(buffer)) == 0)
304  msg_fatal("bad utf-8 encoding for U+%X\n", codepoint);
305  casefold(dest, STR(buffer));
306  }
307  }
308  vstream_printf("range completed: 0x%x..0x%x\n", first, last);
309  }
310 
311  /*
312  * Chroot directory.
313  */
314  else if (strcmp(cmd, "chroot") == 0
315  && sscanf(bp, "%255s", STR(buffer)) == 1) {
316  if (geteuid() == 0) {
317  if (chdir(STR(buffer)) < 0)
318  msg_fatal("chdir(%s): %m", STR(buffer));
319  if (chroot(STR(buffer)) < 0)
320  msg_fatal("chroot(%s): %m", STR(buffer));
321  vstream_printf("chroot %s completed\n", STR(buffer));
322  }
323  }
324 
325  /*
326  * File.
327  */
328  else if (strcmp(cmd, "file") == 0
329  && sscanf(bp, "%255s", STR(buffer)) == 1) {
330  if ((fp = vstream_fopen(STR(buffer), O_RDONLY, 0)) == 0)
331  msg_fatal("open(%s): %m", STR(buffer));
332  while (vstring_fgets_nonl(buffer, fp))
333  vstream_printf("%s\n", casefold(dest, STR(buffer)));
334  vstream_fclose(fp);
335  }
336 
337  /*
338  * Verbose.
339  */
340  else if (strcmp(cmd, "verbose") == 0
341  && sscanf(bp, "%i", &msg_verbose) == 1) {
342  /* void */ ;
343  }
344 
345  /*
346  * Usage
347  */
348  else {
349  vstream_printf("Usage: %s chroot <path> | file <path> | fold <text> | range <first> <last> | verbose <int>\n",
350  argv[0]);
351  }
353  }
354  vstring_free(buffer);
355  vstring_free(dest);
356  exit(0);
357 }
358 
359 #endif /* TEST */
int msg_verbose
Definition: msg.c:177
#define vstring_fgets_nonl(s, p)
NORETURN msg_panic(const char *fmt,...)
Definition: msg.c:295
#define VSTREAM_OUT
Definition: vstream.h:67
int main(int argc, char **argv)
Definition: anvil.c:1010
int valid_utf8_string(const char *, ssize_t)
VSTRING * vstring_strncat(VSTRING *vp, const char *src, ssize_t len)
Definition: vstring.c:471
#define LEN(x)
Definition: casefold.c:110
#define VSTREAM_IN
Definition: vstream.h:66
char * mystrtok(char **src, const char *sep)
Definition: mystrtok.c:54
#define VSTRING_LEN(vp)
Definition: vstring.h:72
VSTREAM * vstream_fopen(const char *path, int flags, mode_t mode)
Definition: vstream.c:1241
#define casefold(dst, src)
Definition: stringops.h:67
#define VSTRING_TERMINATE(vp)
Definition: vstring.h:74
char * casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len)
Definition: casefold.c:114
#define VSTRING_ADDCH(vp, ch)
Definition: vstring.h:81
#define CASEF_FLAG_APPEND
Definition: stringops.h:61
int vstream_fclose(VSTREAM *stream)
Definition: vstream.c:1268
VSTREAM * vstream_printf(const char *fmt,...)
Definition: vstream.c:1335
#define VSTRING_RESET(vp)
Definition: vstring.h:77
VSTRING * vstring_alloc(ssize_t len)
Definition: vstring.c:353
#define CHARS_SPACE
Definition: sys_defs.h:1762
char * lowercase(char *string)
Definition: lowercase.c:34
#define vstring_avail(vp)
Definition: vstring.h:86
#define allascii(s)
Definition: stringops.h:66
NORETURN msg_fatal(const char *fmt,...)
Definition: msg.c:249
int vstream_fflush(VSTREAM *stream)
Definition: vstream.c:1257
#define VSTRING_SPACE(vp, len)
Definition: vstring.h:70
VSTRING * vstring_free(VSTRING *vp)
Definition: vstring.c:380
#define VSTRING_AT_OFFSET(vp, offset)
Definition: vstring.h:92
void msg_vstream_init(const char *name, VSTREAM *vp)
Definition: msg_vstream.c:77
#define ISSPACE(c)
Definition: sys_defs.h:1753
int util_utf8_enable
Definition: printable.c:47
#define CASEF_FLAG_UTF8
Definition: stringops.h:60
#define VSTREAM_ERR
Definition: vstream.h:68
#define STR(x)
Definition: casefold.c:109