// // See the file LICENSE for redistribution information. // // Copyright (c) 2002,2009 Oracle. All rights reserved. // // // Copied from DB XML code (all of UTF8, subset of NsUtil) #include #include "dbxml/DbXml.hpp" #include "Transcoding.hpp" using namespace DbXmlTest; using namespace DbXml; XERCES_CPP_NAMESPACE_USE UTF8ToXMLCh::UTF8ToXMLCh(const std::string &s) { uint32_t l = (uint32_t)s.length(); xmlch_ = (XMLCh *) NsUtil::allocate((l + 1) << 1); // len_ does not get trailing null len_ = NsUtil::nsFromUTF8(&xmlch_, (const xmlbyte_t *)s.c_str(), l + 1, l + 1) - 1; } // string may not be null terminated. UTF8ToXMLCh::UTF8ToXMLCh(const char *s, size_t l) { xmlch_ = (XMLCh *) NsUtil::allocate((l + 1) << 1); // len_ does not get trailing null, so l must not include it. len_ = NsUtil::nsFromUTF8(&xmlch_, (const xmlbyte_t *)s, (uint32_t)l, (uint32_t)l + 1); xmlch_[l] = 0; // Add null terminator } UTF8ToXMLCh::~UTF8ToXMLCh() { delete [] xmlch_; } const XMLCh *UTF8ToXMLCh::str() const { return xmlch_; } int UTF8ToXMLCh::len() const { return len_; } XMLCh *UTF8ToXMLCh::adopt() { XMLCh *result = xmlch_; xmlch_ = 0; return result; } XMLChToUTF8::XMLChToUTF8(const XMLCh* const toTranscode, int len) { if(toTranscode == 0) { p_ = (xmlbyte_t *)NsUtil::allocate(1); *p_ = 0; len_ = 0; } else { if (!len) len = NsUtil::nsStringLen(toTranscode); // 3 bytes per XMLCh is the worst case, + '\0' const unsigned int needed = len * 3 + 1; p_ = (xmlbyte_t *) NsUtil::allocate(needed); // len_ does not get trailing null len_ = NsUtil::nsToUTF8(&p_, toTranscode, len + 1, needed, 0) - 1; } } XMLChToUTF8::~XMLChToUTF8() { if (p_) delete [] p_; } const char *XMLChToUTF8::str() const { return (const char *)p_; } int XMLChToUTF8::len() const { return len_; } char *XMLChToUTF8::adopt() { char *result = (char *)p_; p_ = 0; return result; } /* * Transcoding globals (from Xerces) * * gUTFBytes * A list of counts of trailing bytes for each initial byte in the input. * * gUTFByteIndicator * For a UTF8 sequence of n bytes, n>=2, the first byte of the * sequence must contain n 1's followed by precisely 1 0 with the * rest of the byte containing arbitrary bits. This array stores * the required bit pattern for validity checking. * * gUTFByteIndicatorTest * When bitwise and'd with the observed value, if the observed * value is correct then a result matching gUTFByteIndicator will * be produced. * * gUTFOffsets * A list of values to offset each result char type, according to how * many source bytes when into making it. * * gFirstByteMark * A list of values to mask onto the first byte of an encoded sequence, * indexed by the number of bytes used to create the sequence. */ static const xmlbyte_t gUTFBytes[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; static const xmlbyte_t gUTFByteIndicator[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; static const xmlbyte_t gUTFByteIndicatorTest[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE }; static const uint32_t gUTFOffsets[6] = { 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080 }; static const xmlbyte_t gFirstByteMark[7] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; #define CHECK_ENT(c) \ if (hasEntity && ((c) < chLatin_A)) { \ if (((c) == chAmpersand) || ((c) == chOpenAngle) || \ ((c) == chCloseAngle) || ((c) == chDoubleQuote) || \ ((c) == chSingleQuote)) *hasEntity = true; \ } /* * transcode from UTF-16 to UTF-8 * o will (re)allocate destination buffer if not present. * start out with 3x the number of characters. This memory * should not hang around long. * o returns number of *bytes* in the transcoded buffer. * o code is derived from the Xerces class, XMLUTF8Transcoder * o returns true or false in hasEntity if a special entity is * encountered: &, <, >, ", ' * No error checking -- the UTF-16 input was created by Xerces, and * is trusted. */ int NsUtil::nsToUTF8(xmlbyte_t **dest, const xmlch_t *src, uint32_t nchars, uint32_t maxbytes, bool *hasEntity) { uint32_t bufsize; xmlbyte_t *outPtr = *dest; if (hasEntity) *hasEntity = false; if (!outPtr) { bufsize = nchars * 3; outPtr = (xmlbyte_t *) NsUtil::allocate(bufsize); *dest = outPtr; } else { bufsize = maxbytes; } const xmlch_t *srcEnd = src + nchars; const xmlbyte_t *outEnd = outPtr + bufsize; while (src < srcEnd) { uint32_t curVal = *src; // deal with surrogates if ((curVal >= 0xd800) && (curVal <= 0xdbff)) { curVal = ((curVal - 0xd800) << 10) + ((*(src + 1) - 0xdc00) + 0x10000); src++; } src++; CHECK_ENT(curVal); // how many bytes uint32_t nbytes; if (curVal < 0x80) nbytes = 1; else if (curVal < 0x800) nbytes = 2; else if (curVal < 0x10000) nbytes = 3; else if (curVal < 0x200000) nbytes = 4; else if (curVal < 0x4000000) nbytes = 5; else if (curVal <= 0x7FFFFFFF) nbytes = 6; if (outPtr + nbytes > outEnd) { // reallocate (double size), or fail uint32_t count = (uint32_t)(outPtr - *dest); xmlbyte_t *newmem = (xmlbyte_t *) NsUtil::allocate(bufsize << 1); memcpy(newmem, *dest, bufsize); bufsize <<= 1; NsUtil::deallocate(*dest); *dest = newmem; outEnd = newmem + bufsize; outPtr = newmem + count; } /* optimize 1-byte case */ if (nbytes == 1) { *outPtr++ = (xmlbyte_t) curVal; } else { outPtr += nbytes; // work backwards switch(nbytes) { case 6 : *--outPtr = xmlbyte_t((curVal | 0x80UL) & 0xbfUL); curVal >>= 6; case 5 : *--outPtr = xmlbyte_t((curVal | 0x80UL) & 0xbfUL); curVal >>= 6; case 4 : *--outPtr = xmlbyte_t((curVal | 0x80UL) & 0xbfUL); curVal >>= 6; case 3 : *--outPtr = xmlbyte_t((curVal | 0x80UL) & 0xbfUL); curVal >>= 6; case 2 : *--outPtr = xmlbyte_t((curVal | 0x80UL) & 0xbfUL); curVal >>= 6; case 1 : *--outPtr = xmlbyte_t (curVal | gFirstByteMark[nbytes]); } outPtr += nbytes; // add bytes back again } } // while return (int)(outPtr - *dest); } static inline void checkTrailingBytes(const xmlbyte_t toCheck) { if ((toCheck & 0xC0) != 0x80) NsUtil::nsThrowException(XmlException::INVALID_VALUE, "checkTrailingBytes: bad utf-8 encoding", __FILE__, __LINE__); } /* * _nsFromUTF8 -- internal method for transcoding from utf-8 to * utf-16 * * This method returns how much of the original * utf-8 string is consumed in the eaten paramter. * This allows long strings to be transcoded in chunks. * * The code is basically structured to allow multiple chunk calls. * This is derived from Xerces transcoding code. * * The string is not entirely trusted as a valid utf-8 encoding. */ static int _nsFromUTF8(xmlch_t *dest, const unsigned char *src, uint32_t nbytes, uint32_t maxchars, uint32_t &eaten) { NS_ASSERT(nbytes) const xmlbyte_t* srcP = (const xmlbyte_t *) src; const xmlbyte_t* endP = srcP + nbytes; xmlch_t *outP = dest; xmlch_t *outEnd = outP + maxchars; // loop until done with input, or out of space while ((srcP < endP) && (outP < outEnd)) { // ascii subset is simple if (*srcP <= 127) { *outP++ = *srcP++; continue; } // how many trailing src bytes will this sequence require const uint32_t trailingBytes = gUTFBytes[*srcP]; // if not enough source to do an entire character, break out // before processing the current one if (srcP + trailingBytes >= endP) break; // start constructing the value. // Don't entirely trust encoding -- do validation checks // test first byte if((gUTFByteIndicatorTest[trailingBytes] & *srcP) != gUTFByteIndicator[trailingBytes]) { NsUtil::nsThrowException(XmlException::INVALID_VALUE, "nsFromUTF8: bad utf-8 encoding", __FILE__, __LINE__); } /*** * See http://www.unicode.org/reports/tr27/ * * Table 3.1B. lists all of the byte sequences that * are legal in UTF-8. A range of byte values such as * A0..BF indicates that any byte from A0 to BF (inclusive) * is legal in that position. * Any byte value outside of the ranges listed is illegal. * * For example,the byte sequence is illegal * since C0 is not legal in the 1st Byte column. * * The byte sequence is illegal since in the row * where E0 is legal as a first byte, 9F is not legal * as a second byte. * * The byte sequence is legal, * since every byte in that sequence matches * a byte range in a row of the table (the last row). * * The table is reproduced here for reference * * Table 3.1B. Legal UTF-8 Byte Sequences * Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte * ========================================================== * U+0000..U+007F 00..7F * ---------------------------------------------------------- * U+0080..U+07FF C2..DF 80..BF * * ---------------------------------------------------------- * U+0800..U+0FFF E0 A0..BF 80..BF * -- * * U+1000..U+FFFF E1..EF 80..BF 80..BF * * ---------------------------------------------------------- * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF * -- * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF * -- * ========================================================== * * Cases where a trailing byte range is not 80..BF are * underlined in the table to draw attention to them. * These occur only in the second byte of a sequence. */ uint32_t tmpVal = 0; switch(trailingBytes) { case 1 : // UTF-8: [110y yyyy] [10xx xxxx] // Unicode: [0000 0yyy] [yyxx xxxx] // // 0xC0, 0xC1 has been filtered out checkTrailingBytes(*(srcP+1)); tmpVal = *srcP++; tmpVal <<= 6; tmpVal += *srcP++; break; case 2 : // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx] // Unicode: [zzzz yyyy] [yyxx xxxx] // if (( *srcP == 0xE0) && ( *(srcP+1) < 0xA0)) { NsUtil::nsThrowException( XmlException::INVALID_VALUE, "nsFromUTF8: bad utf-8 encoding", __FILE__, __LINE__); } checkTrailingBytes(*(srcP+1)); checkTrailingBytes(*(srcP+2)); // // See D36 (a)-(c) of // http://www.unicode.org/reports/tr27/ // //irregular three bytes sequence // that is zzzzyy matches leading surrogate tag // 110110 or trailing surrogate tag 110111 // // *srcP=1110 1101 // *(srcP+1)=1010 yyyy or // *(srcP+1)=1011 yyyy // // 0xED 1110 1101 // 0xA0 1010 0000 if ((*srcP == 0xED) && (*(srcP+1) >= 0xA0)) { NsUtil::nsThrowException( XmlException::INVALID_VALUE, "nsFromUTF8: bad utf-8 encoding", __FILE__, __LINE__); } tmpVal = *srcP++; tmpVal <<= 6; tmpVal += *srcP++; tmpVal <<= 6; tmpVal += *srcP++; break; case 3 : // UTF-8:[1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]* // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate) // [1101 11yy] [yyxx xxxx] (low surrogate) // * uuuuu = wwww + 1 // if (((*srcP == 0xF0) && (*(srcP+1) < 0x90)) || ((*srcP == 0xF4) && (*(srcP+1) > 0x8F))) { NsUtil::nsThrowException( XmlException::INVALID_VALUE, "nsFromUTF8: bad utf-8 encoding", __FILE__, __LINE__); } checkTrailingBytes(*(srcP+1)); checkTrailingBytes(*(srcP+2)); checkTrailingBytes(*(srcP+3)); tmpVal = *srcP++; tmpVal <<= 6; tmpVal += *srcP++; tmpVal <<= 6; tmpVal += *srcP++; tmpVal <<= 6; tmpVal += *srcP++; break; default: // trailingBytes > 3 /*** * The definition of UTF-8 in Annex D of ISO/IEC * 10646-1:2000 also allows for the use of five- and * six-byte sequences to encode characters that are * outside the range of the Unicode character set; * those five- and six-byte sequences are illegal * for the use of UTF-8 as a transformation of Unicode * characters. ISO/IEC 10646 does not allow mapping * of unpaired surrogates, nor U+FFFE and * U+FFFF (but it does allow other noncharacters). ***/ NsUtil::nsThrowException( XmlException::INVALID_VALUE, "nsFromUTF8: bad utf-8 encoding", __FILE__, __LINE__); break; } tmpVal -= gUTFOffsets[trailingBytes]; // // If it will fit into a single char, then put it in. Otherwise // encode it as a surrogate pair. If its not valid, use the // replacement char. // if (!(tmpVal & 0xFFFF0000)) { *outP++ = (xmlch_t) tmpVal; } else if (tmpVal > 0x10FFFF) { NsUtil::nsThrowException( XmlException::INVALID_VALUE, "nsFromUTF8: bad utf-8 encoding", __FILE__, __LINE__); } else { // if there's not enough room for all chars, // break out. if (outP + 1 >= outEnd) break; // this char never happened // Store the leading surrogate char tmpVal -= 0x10000; *outP++ = (xmlch_t)((tmpVal >> 10) + 0xD800); // the trailing char *outP++ = (xmlch_t)((tmpVal & 0x3FF) + 0xDC00); } } // Update the bytes eaten eaten = (uint32_t)(srcP - src); // Return the characters read return (int)(outP - dest); } /* * transcode from UTF-8 to UTF-16. Return number of UTF-16 * characters that resulted, including NULLs, if any are included * in nbytes. */ int NsUtil::nsFromUTF8(xmlch_t **dest, const xmlbyte_t *src, uint32_t nbytes, uint32_t maxchars) { uint32_t bufsize; xmlch_t *outPtr = *dest; uint32_t eaten; if (!outPtr) { bufsize = nbytes << 1; // multiply by 2 // caller needs to know if using manager or new for allocation outPtr = (xmlch_t *) NsUtil::allocate(bufsize); *dest = outPtr; } if (!maxchars) maxchars = nbytes; return _nsFromUTF8(outPtr, src, nbytes, maxchars, eaten); } int NsUtil::nsStringLen(const xmlch_t *str) { int ret = 0; while (*str++ != 0) ret++; return ret; } void NsUtil::nsThrowException(XmlException::ExceptionCode code, const char *desc, const char *file, int line) { throw XmlException(code, desc, file, line); } void *NsUtil::allocate(size_t size) { void *ret = ::malloc(size); if (!ret) { nsThrowException(XmlException::NO_MEMORY_ERROR, "allocation failed"); } return ret; } void NsUtil::deallocate(void *obj) { ::free(obj); }