28 #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
// Unsigned integer aliases used by the rest of this header for octets,
// UTF-16 code units and code points. They carry the same names as the
// <cstdint> typedefs. The listing line numbers that were fused into these
// declarations have been removed so they compile.
// NOTE(review): the chosen underlying types assume 8-bit char, 16-bit short
// and 32-bit int -- adjust if the target platform differs.
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// UTF-16 surrogate ranges: lead surrogates occupy 0xd800-0xdbff and trail
// surrogates occupy 0xdc00-0xdfff.
const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
// 0xd800 - 0x40 == 0xd7c0.
const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);
// Evaluated in unsigned arithmetic, so the subtraction wraps modulo 2^32;
// with 32-bit unsigned int the value is 0xfca02400. Adding it to
// (lead << 10) + trail therefore cancels both surrogate bases and adds 0x10000.
const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
// Largest value accepted as a code point by is_code_point_valid.
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
// Returns the low 8 bits of `oc` as an unsigned octet. Masking first means a
// negative (signed char) input yields its bit pattern, e.g. char(-1) -> 0xff.
// The braces lost from the listing and the stray line numbers are repaired.
template <typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    return static_cast<uint8_t>(0xff & oc);
}
// Returns the low 16 bits of `oc` as an unsigned 16-bit value. Masking first
// means a negative input yields its bit pattern, e.g. short(-1) -> 0xffff.
// The braces lost from the listing and the stray line numbers are repaired.
template <typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    return static_cast<uint16_t>(0xffff & oc);
}
68 template<
typename octet_type>
69 inline bool is_trail(octet_type oc)
71 return ((utf8::internal::mask8(oc) >> 6) == 0x2);
74 template <
typename u16>
75 inline bool is_lead_surrogate(u16 cp)
77 return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
80 template <
typename u16>
81 inline bool is_trail_surrogate(u16 cp)
83 return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
86 template <
typename u16>
87 inline bool is_surrogate(u16 cp)
89 return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
92 template <
typename u32>
93 inline bool is_code_point_valid(u32 cp)
95 return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
// Classifies the octet that `lead_it` points at by its high-order bits.
// NOTE(review): this listing is incomplete -- the embedded numbering jumps
// 102 -> 105 -> 107 -> 109, so the opening `if`, every branch body and the
// braces are not visible. Presumably each branch returns the sequence length
// for that lead-octet pattern; confirm against the complete file.
98 template <
typename octet_iterator>
99 inline typename std::iterator_traits<octet_iterator>::difference_type
100 sequence_length(octet_iterator lead_it)
// Mask to 8 bits so the shifts below operate on an unsigned octet.
102 uint8_t lead = utf8::internal::mask8(*lead_it);
// Top three bits are 110.
105 else if ((lead >> 5) == 0x6)
// Top four bits are 1110.
107 else if ((lead >> 4) == 0xe)
// Top five bits are 11110.
109 else if ((lead >> 3) == 0x1e)
// Takes a decoded code point and the length of the sequence it was read from.
// NOTE(review): this listing is incomplete -- the numbering jumps
// 116 -> 122 -> 126, so the first branch, all branch bodies, the return
// statements and the braces are not visible. Presumably each range check
// compares `length` with the shortest encoding for that range; confirm
// against the complete file.
115 template <
typename octet_difference_type>
116 inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
// Reached for code points below 0x800 not handled by an earlier branch.
122 else if (cp < 0x800) {
// Reached for code points below 0x10000 not handled by an earlier branch.
126 else if (cp < 0x10000) {
// Status codes returned by the decoding helpers in this header.
// The listing line number fused into the declaration has been removed.
enum utf_error {
    UTF8_OK,              // no error
    NOT_ENOUGH_ROOM,      // returned by increase_safely and the get_sequence_N helpers
    INVALID_LEAD,         // NOTE(review): no use is visible in this listing
    INCOMPLETE_SEQUENCE,  // an expected trail octet was not a trail octet
    OVERLONG_SEQUENCE,    // set by validate_next around its is_overlong_sequence check
    INVALID_CODE_POINT    // set by validate_next around its is_code_point_valid check
};
// Helper taking the iterator by reference together with the end of the range;
// reports a utf_error.
// NOTE(review): this listing is incomplete -- the condition guarding the first
// return, the success return and the braces are not visible.
137 template <
typename octet_iterator>
138 utf_error increase_safely(octet_iterator& it, octet_iterator end)
// The guard for this return is not visible in this listing.
141 return NOT_ENOUGH_ROOM;
// The octet at `it` must be a trail octet; otherwise report INCOMPLETE_SEQUENCE.
143 if (!utf8::internal::is_trail(*it))
144 return INCOMPLETE_SEQUENCE;
// Calls increase_safely(IT, END) and makes the *enclosing* function return the
// resulting utf_error if it is not UTF8_OK. The expansion is a braced block,
// so call sites use it without a trailing semicolon.
// The listing line number fused into the directive has been removed.
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
// Decodes a one-octet sequence: the code point is the octet itself.
// NOTE(review): this listing is incomplete -- the condition guarding the
// NOT_ENOUGH_ROOM return, the final return and the braces are not visible.
152 template <
typename octet_iterator>
153 utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
// The guard for this return is not visible in this listing.
156 return NOT_ENOUGH_ROOM;
// Store the current octet, masked to 8 bits, as the code point.
158 code_point = utf8::internal::mask8(*it);
// Decodes a two-octet sequence into `code_point`.
// NOTE(review): this listing is incomplete -- the condition guarding the
// NOT_ENOUGH_ROOM return, the final return and the braces are not visible.
163 template <
typename octet_iterator>
164 utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
// The guard for this return is not visible in this listing.
167 return NOT_ENOUGH_ROOM;
// Start from the lead octet, masked to 8 bits.
169 code_point = utf8::internal::mask8(*it);
// Step to the next octet; returns from this function if that step reports an error.
171 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Shift the lead value left by 6 and keep the low 11 bits, then add the low
// 6 bits of the current octet.
173 code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
// Decodes a three-octet sequence into `code_point`.
// NOTE(review): this listing is incomplete -- the condition guarding the
// NOT_ENOUGH_ROOM return, the final return and the braces are not visible.
178 template <typename octet_iterator>
179 utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
// The guard for this return is not visible in this listing.
182 return NOT_ENOUGH_ROOM;
// Start from the lead octet, masked to 8 bits.
184 code_point = utf8::internal::mask8(*it);
// Step to the second octet; returns from this function if that step reports an error.
186 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Lead value shifted left by 12 (low 16 bits kept) plus the second octet
// shifted left by 6 (low 12 bits kept).
188 code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
// Step to the third octet; returns from this function if that step reports an error.
190 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Add the low 6 bits of the third octet.
192 code_point += (*it) & 0x3f;
// Decodes a four-octet sequence into `code_point`.
// NOTE(review): this listing is incomplete -- the condition guarding the
// NOT_ENOUGH_ROOM return, the final return and the braces are not visible.
197 template <typename octet_iterator>
198 utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
// The guard for this return is not visible in this listing.
201 return NOT_ENOUGH_ROOM;
// Start from the lead octet, masked to 8 bits.
203 code_point = utf8::internal::mask8(*it);
// Step to the second octet; returns from this function if that step reports an error.
205 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Lead value shifted left by 18 (low 21 bits kept) plus the second octet
// shifted left by 12 (low 18 bits kept).
207 code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
// Step to the third octet; returns from this function if that step reports an error.
209 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Add the third octet shifted left by 6 (low 12 bits kept).
211 code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
// Step to the fourth octet; returns from this function if that step reports an error.
213 UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
// Add the low 6 bits of the fourth octet.
215 code_point += (*it) & 0x3f;
220 #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
// Decodes and checks the sequence starting at `it`, reporting a utf_error.
// NOTE(review): this listing is incomplete -- the declaration of `cp`, the
// logic that selects one of the get_sequence_N calls, the else branches, the
// success path, the final return and the braces are not visible. The comments
// below describe only the visible lines.
222 template <
typename octet_iterator>
223 utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
// Copy of the starting position; its later use is not visible here
// (presumably to restore `it` on failure -- confirm).
227 octet_iterator original_it = it;
231 typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
// Sequence length as reported by sequence_length for the octet at `it`.
232 const octet_difference_type length = utf8::internal::sequence_length(it);
235 utf_error err = UTF8_OK;
// One decoder per sequence length; how the choice is made is not visible
// (presumably a dispatch on `length` -- confirm).
240 err = utf8::internal::get_sequence_1(it, end, cp);
243 err = utf8::internal::get_sequence_2(it, end, cp);
246 err = utf8::internal::get_sequence_3(it, end, cp);
249 err = utf8::internal::get_sequence_4(it, end, cp);
// After a successful decode, check the code point and then the encoding length.
253 if (err == UTF8_OK) {
255 if (utf8::internal::is_code_point_valid(cp)) {
256 if (!utf8::internal::is_overlong_sequence(cp, length)){
// Presumably the else branch of the overlong check -- confirm.
263 err = OVERLONG_SEQUENCE;
// Presumably the else branch of the code-point validity check -- confirm.
266 err = INVALID_CODE_POINT;
// Overload without the code-point out-parameter: forwards to the
// three-argument validate_next and returns its utf_error.
// NOTE(review): the declaration of `ignored` and the closing brace are not
// visible in this listing.
274 template <
typename octet_iterator>
275 inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
277 return utf8::internal::validate_next(it, end, ignored);
// The three octets 0xef 0xbb 0xbf (the UTF-8 byte order mark) that
// starts_with_bom and is_bom compare against.
// The listing line number fused into the declaration has been removed.
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
// Walks [start, end) by calling validate_next until `end` is reached.
// NOTE(review): this listing is incomplete -- the statement executed when a
// call reports an error, the final return and the braces are not visible.
// Presumably the position of the first invalid sequence is returned; confirm
// against the complete file.
287 template <
typename octet_iterator>
288 octet_iterator find_invalid(octet_iterator start, octet_iterator end)
290 octet_iterator result = start;
291 while (result != end) {
// validate_next takes `result` by reference.
292 utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
// The statement controlled by this check is not visible in this listing.
293 if (err_code != internal::UTF8_OK)
299 template <
typename octet_iterator>
300 inline bool is_valid(octet_iterator start, octet_iterator end)
302 return (utf8::find_invalid(start, end) == end);
305 template <
typename octet_iterator>
306 inline bool starts_with_bom (octet_iterator it, octet_iterator end)
309 ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
310 ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
311 ((it != end) && (utf8::internal::mask8(*it)) == bom[2])
316 template <
typename octet_iterator>
317 inline bool is_bom (octet_iterator it)
320 (utf8::internal::mask8(*it++)) == bom[0] &&
321 (utf8::internal::mask8(*it++)) == bom[1] &&
322 (utf8::internal::mask8(*it)) == bom[2]
327 #endif // header guard