Postfix3.3.1
valid_utf8_string.c
[詳解]
1 /*++
2 /* NAME
3 /* valid_utf8_string 3
4 /* SUMMARY
5 /* predicate if string is valid UTF-8
6 /* SYNOPSIS
7 /* #include <stringops.h>
8 /*
9 /* int valid_utf8_string(str, len)
10 /* const char *str;
11 /* ssize_t len;
12 /* DESCRIPTION
13 /* valid_utf8_string() determines if a string satisfies the UTF-8
14 /* definition in RFC 3629. That is, it contains proper encodings
15 /* of code points U+0000..U+10FFFF, excluding over-long encodings
16 /* and excluding U+D800..U+DFFF surrogates.
17 /*
18 /* A zero-length string is considered valid.
19 /* DIAGNOSTICS
20 /* The result value is zero when the caller specifies a negative
21 /* length, or a string that violates RFC 3629, for example a
22 /* string that is truncated in the middle of a multi-byte
23 /* sequence.
24 /* BUGS
25 /* But wait, there is more. Code points in the range U+FDD0..U+FDEF,
26 /* and code points ending in FFFE or FFFF, are non-characters in
27 /* Unicode. This function does not reject these.
28 /* SEE ALSO
29 /* RFC 3629
30 /* LICENSE
31 /* .ad
32 /* .fi
33 /* The Secure Mailer license must be distributed with this software.
34 /* AUTHOR(S)
35 /* Wietse Venema
36 /* IBM T.J. Watson Research
37 /* P.O. Box 704
38 /* Yorktown Heights, NY 10598, USA
39 /*--*/
40 
41 /* System library. */
42 
43 #include <sys_defs.h>
44 
45 /* Utility library. */
46 
47 #include <stringops.h>
48 
/*
 * Branch-prediction hints. <sys_defs.h> normally supplies these; the no-op
 * fallbacks below only take effect when this code is built without it.
 */
#ifndef EXPECTED
#define EXPECTED(x)     (x)
#endif
#ifndef UNEXPECTED
#define UNEXPECTED(x)   (x)
#endif

/*
 * valid_utf8_string - validate string according to RFC 3629
 *
 * str: the bytes to examine; they need not be null-terminated, and embedded
 * null bytes are treated as ordinary single-byte encodings.
 *
 * len: the number of bytes to examine.
 *
 * Returns 1 when the input is empty or consists entirely of well-formed
 * UTF-8 sequences. Returns 0 when len is negative, when a sequence is
 * truncated, over-long, encodes a surrogate (U+D800..U+DFFF), or encodes a
 * value above U+10FFFF.
 */
int     valid_utf8_string(const char *str, ssize_t len)
{
    const unsigned char *cp;
    const unsigned char *end;
    unsigned char c0;
    unsigned char ch;

    /*
     * Decide the trivial cases before doing any pointer arithmetic, so that
     * no pointer is ever computed from a negative length.
     */
    if (len < 0)
        return (0);
    if (len == 0)
        return (1);

    cp = (const unsigned char *) str;
    end = cp + len;

    /*
     * Optimized for correct input, time, space, and for CPUs that have a
     * decent number of registers.
     * 
     * Truncation is detected by comparing the remaining byte count (end - cp)
     * with the sequence length. Unlike "cp + N >= end", this never forms a
     * pointer more than one past the end of the input.
     */
    for ( /* void */ ; cp < end; cp++) {
        c0 = *cp;
        /* Single-byte encodings. */
        if (EXPECTED(c0 <= 0x7f) /* we know that c0 >= 0x0 */ ) {
            /* void */ ;
        }
        /* Two-byte encodings. */
        else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
            /* Exclude stray tail bytes and over-long encodings. */
            if (UNEXPECTED(c0 < 0xc2)
                /* Exclude truncated sequences. */
                || UNEXPECTED(end - cp < 2)
                /* Require UTF-8 tail byte. */
                || UNEXPECTED((*++cp & 0xc0) != 0x80))
                return (0);
        }
        /* Three-byte encodings. */
        else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
            /* Exclude truncated sequences. */
            if (UNEXPECTED(end - cp < 3)
                /* Exclude over-long encodings. */
                || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
                /* Exclude U+D800..U+DFFF. */
                || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
                /* Require UTF-8 tail byte. */
                || UNEXPECTED((*++cp & 0xc0) != 0x80))
                return (0);
        }
        /* Four-byte encodings. */
        else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
            /* Exclude truncated sequences. */
            if (UNEXPECTED(end - cp < 4)
                /* Exclude over-long encodings. */
                || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
                /* Exclude code points above U+10FFFF. */
                || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
                /* Require UTF-8 tail byte. */
                || UNEXPECTED((*++cp & 0xc0) != 0x80)
                /* Require UTF-8 tail byte. */
                || UNEXPECTED((*++cp & 0xc0) != 0x80))
                return (0);
        }
        /* Invalid: c0 >= 0xf5 */
        else {
            return (0);
        }
    }
    return (1);
}
111 
112  /*
113  * Stand-alone test program. Each string is a line without line terminator.
114  */
115 #ifdef TEST
116 #include <stdlib.h>
117 #include <vstream.h>
118 #include <vstring.h>
119 #include <vstring_vstream.h>
120 
121 #define STR(x) vstring_str(x)
122 #define LEN(x) VSTRING_LEN(x)
123 
124 int main(void)
125 {
126  VSTRING *buf = vstring_alloc(1);
127 
128  while (vstring_get_nonl(buf, VSTREAM_IN) != VSTREAM_EOF) {
129  vstream_printf("%c", (LEN(buf) && !valid_utf8_string(STR(buf), LEN(buf))) ?
130  '!' : ' ');
131  vstream_fwrite(VSTREAM_OUT, STR(buf), LEN(buf));
132  vstream_printf("\n");
133  }
135  vstring_free(buf);
136  exit(0);
137 }
138 
139 #endif
#define VSTREAM_EOF
Definition: vstream.h:110
int vstring_get_nonl(VSTRING *vp, VSTREAM *fp)
#define VSTREAM_OUT
Definition: vstream.h:67
int main(int argc, char **argv)
Definition: anvil.c:1010
#define LEN
Definition: cleanup_addr.c:106
#define VSTREAM_IN
Definition: vstream.h:66
#define EXPECTED(x)
Definition: sys_defs.h:1638
VSTREAM * vstream_printf(const char *fmt,...)
Definition: vstream.c:1335
#define STR(x)
Definition: anvil.c:518
VSTRING * vstring_alloc(ssize_t len)
Definition: vstring.c:353
int vstream_fflush(VSTREAM *stream)
Definition: vstream.c:1257
#define vstream_fwrite(v, b, n)
Definition: vstream.h:105
VSTRING * vstring_free(VSTRING *vp)
Definition: vstring.c:380
int valid_utf8_string(const char *str, ssize_t len)
#define UNEXPECTED(x)
Definition: sys_defs.h:1639