/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * $Id$ */ // --------------------------------------------------------------------------- // Includes // --------------------------------------------------------------------------- #include #include #include #include #include #include #include #include #include XERCES_CPP_NAMESPACE_BEGIN // --------------------------------------------------------------------------- // XMLReader: Query Methods // --------------------------------------------------------------------------- // Checks whether all of the chars in the passed buffer are whitespace or // not. Breaks out on the first non-whitespace. // bool XMLReader::isAllSpaces(const XMLCh* const toCheck , const XMLSize_t count) const { const XMLCh* curCh = toCheck; const XMLCh* endPtr = toCheck + count; while (curCh < endPtr) { if (!(fgCharCharsTable[*curCh++] & gWhitespaceCharMask)) return false; } return true; } // // Checks whether at least one of the chars in the passed buffer are whitespace or // not. // bool XMLReader::containsWhiteSpace(const XMLCh* const toCheck , const XMLSize_t count) const { const XMLCh* curCh = toCheck; const XMLCh* endPtr = toCheck + count; while (curCh < endPtr) { if (fgCharCharsTable[*curCh++] & gWhitespaceCharMask) return true; } return false; } // // This one is not called terribly often, so call the XMLChar utility // bool XMLReader::isPublicIdChar(const XMLCh toCheck) const { if (fXMLVersion == XMLV1_1) return XMLChar1_1::isPublicIdChar(toCheck); else return XMLChar1_0::isPublicIdChar(toCheck); } // --------------------------------------------------------------------------- // XMLReader: Constructors and Destructor // --------------------------------------------------------------------------- XMLReader::XMLReader(const XMLCh* const pubId , const XMLCh* const sysId , BinInputStream* const streamToAdopt , const RefFrom from , const Types type , const Sources source , const bool throwAtEnd , const bool calculateSrcOfs , XMLSize_t lowWaterMark , const XMLVersion version , MemoryManager* const manager) : fCharIndex(0) , fCharsAvail(0) , fCurCol(1) , fCurLine(1) , fEncodingStr(0) , fForcedEncoding(false) , fNoMore(false) , fPublicId(XMLString::replicate(pubId, manager)) , fRawBufIndex(0) , fRawBytesAvail(0) , fLowWaterMark (lowWaterMark) , fReaderNum(0xFFFFFFFF) , fRefFrom(from) , fSentTrailingSpace(false) , fSource(source) , fSrcOfsBase(0) , fSrcOfsSupported(false) , fCalculateSrcOfs(calculateSrcOfs) , fSystemId(XMLString::replicate(sysId, manager)) , fStream(streamToAdopt) , fSwapped(false) , fThrowAtEnd(throwAtEnd) , fTranscoder(0) , fType(type) , fMemoryManager(manager) { setXMLVersion(version); // Do an initial load of raw bytes refreshRawBuffer(); // Ask the transcoding service if it supports src offset info fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs(); // // Use the recognizer class to get a basic sense of what family of // encodings this file is in. We'll start off with a reader of that // type, and update it later if needed when we read the XMLDecl line. // fEncoding = XMLRecognizer::basicEncodingProbe(fRawByteBuf, fRawBytesAvail); #if defined(XERCES_DEBUG) if ((fEncoding < XMLRecognizer::Encodings_Min) || (fEncoding > XMLRecognizer::Encodings_Max)) { ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager); } #endif fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding, fMemoryManager), fMemoryManager); // Check whether the fSwapped flag should be set or not checkForSwapped(); // // This will check to see if the first line is an XMLDecl and, if // so, decode that first line manually one character at a time. This // leaves enough characters in the buffer that the high level code // can get through the Decl and call us back with the real encoding. // doInitDecode(); // // NOTE: We won't create a transcoder until we either get a call to // setEncoding() or we get a call to refreshCharBuffer() and no // transcoder has been set yet. // } XMLReader::XMLReader(const XMLCh* const pubId , const XMLCh* const sysId , BinInputStream* const streamToAdopt , const XMLCh* const encodingStr , const RefFrom from , const Types type , const Sources source , const bool throwAtEnd , const bool calculateSrcOfs , XMLSize_t lowWaterMark , const XMLVersion version , MemoryManager* const manager) : fCharIndex(0) , fCharsAvail(0) , fCurCol(1) , fCurLine(1) , fEncoding(XMLRecognizer::UTF_8) , fEncodingStr(0) , fForcedEncoding(true) , fNoMore(false) , fPublicId(XMLString::replicate(pubId, manager)) , fRawBufIndex(0) , fRawBytesAvail(0) , fLowWaterMark (lowWaterMark) , fReaderNum(0xFFFFFFFF) , fRefFrom(from) , fSentTrailingSpace(false) , fSource(source) , fSrcOfsBase(0) , fSrcOfsSupported(false) , fCalculateSrcOfs(calculateSrcOfs) , fSystemId(XMLString::replicate(sysId, manager)) , fStream(streamToAdopt) , fSwapped(false) , fThrowAtEnd(throwAtEnd) , fTranscoder(0) , fType(type) , fMemoryManager(manager) { setXMLVersion(version); // Do an initial load of raw bytes refreshRawBuffer(); // Copy the encoding string to our member fEncodingStr = XMLString::replicate(encodingStr, fMemoryManager); XMLString::upperCaseASCII(fEncodingStr); // Ask the transcoding service if it supports src offset info fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs(); // // Map the passed encoding name to one of our enums. If it does not // match one of the intrinsic encodings, it will come back 'other', // which tells us to create a transcoder based reader. // fEncoding = XMLRecognizer::encodingForName(fEncodingStr); // test the presence of the BOM and remove it from the source switch(fEncoding) { case XMLRecognizer::UCS_4B : case XMLRecognizer::UCS_4L : { if (fRawBytesAvail > 4 && (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) || ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00))) ) { fRawBufIndex += 4; } break; } case XMLRecognizer::UTF_8 : { // Look at the raw buffer as short chars const char* asChars = (const char*)fRawByteBuf; if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen && XMLString::compareNString( asChars , XMLRecognizer::fgUTF8BOM , XMLRecognizer::fgUTF8BOMLen) == 0) { fRawBufIndex += XMLRecognizer::fgUTF8BOMLen; } break; } case XMLRecognizer::UTF_16B : case XMLRecognizer::UTF_16L : { if (fRawBytesAvail < 2) break; const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex]; if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker)) { fRawBufIndex += sizeof(UTF16Ch); } break; } case XMLRecognizer::EBCDIC: case XMLRecognizer::US_ASCII: case XMLRecognizer::XERCES_XMLCH: case XMLRecognizer::OtherEncoding: case XMLRecognizer::Encodings_Count: { // silence warning about enumeration not being used break; } } // Check whether the fSwapped flag should be set or not checkForSwapped(); // // Create a transcoder for the encoding. Since the encoding has been // forced, this will be the one we will use, period. // XMLTransService::Codes failReason; if (fEncoding == XMLRecognizer::OtherEncoding) { // // fEncodingStr not pre-recognized, use it // directly for transcoder // fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor ( fEncodingStr , failReason , kCharBufSize , fMemoryManager ); } else { // // Use the recognized fEncoding to create the transcoder // fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor ( fEncoding , failReason , kCharBufSize , fMemoryManager ); } if (!fTranscoder) { // We are about to throw which means the d-tor won't be called. // Clean up some memory. // fMemoryManager->deallocate(fPublicId); fMemoryManager->deallocate(fSystemId); ArrayJanitor jan (fEncodingStr, fMemoryManager); ThrowXMLwithMemMgr1 ( TranscodingException , XMLExcepts::Trans_CantCreateCvtrFor , fEncodingStr , fMemoryManager ); } // // Note that, unlike above, we do not do an initial decode of the // first line. We take the caller's word that the encoding is correct // and just assume that the first bulk decode (kicked off by the first // get of a character) will work. // // So we do here the slipping in of the leading space if required. // if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral)) { // This represents no data from the source fCharSizeBuf[fCharsAvail] = 0; fCharOfsBuf[fCharsAvail] = 0; fCharBuf[fCharsAvail++] = chSpace; } } XMLReader::XMLReader(const XMLCh* const pubId , const XMLCh* const sysId , BinInputStream* const streamToAdopt , XMLRecognizer::Encodings encodingEnum , const RefFrom from , const Types type , const Sources source , const bool throwAtEnd , const bool calculateSrcOfs , XMLSize_t lowWaterMark , const XMLVersion version , MemoryManager* const manager) : fCharIndex(0) , fCharsAvail(0) , fCurCol(1) , fCurLine(1) , fEncoding(XMLRecognizer::UTF_8) , fEncodingStr(0) , fForcedEncoding(true) , fNoMore(false) , fPublicId(XMLString::replicate(pubId, manager)) , fRawBufIndex(0) , fRawBytesAvail(0) , fLowWaterMark (lowWaterMark) , fReaderNum(0xFFFFFFFF) , fRefFrom(from) , fSentTrailingSpace(false) , fSource(source) , fSrcOfsBase(0) , fSrcOfsSupported(false) , fCalculateSrcOfs(calculateSrcOfs) , fSystemId(XMLString::replicate(sysId, manager)) , fStream(streamToAdopt) , fSwapped(false) , fThrowAtEnd(throwAtEnd) , fTranscoder(0) , fType(type) , fMemoryManager(manager) { setXMLVersion(version); // Do an initial load of raw bytes refreshRawBuffer(); // Ask the transcoding service if it supports src offset info fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs(); // // Use the passed encoding code // fEncoding = encodingEnum; fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding, fMemoryManager), fMemoryManager); // Check whether the fSwapped flag should be set or not checkForSwapped(); // // Create a transcoder for the encoding. Since the encoding has been // forced, this will be the one we will use, period. // XMLTransService::Codes failReason; fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor ( fEncoding , failReason , kCharBufSize , fMemoryManager ); if (!fTranscoder) { // We are about to throw which means the d-tor won't be called. // Clean up some memory. // fMemoryManager->deallocate(fPublicId); fMemoryManager->deallocate(fSystemId); ArrayJanitor jan (fEncodingStr, fMemoryManager); ThrowXMLwithMemMgr1 ( TranscodingException , XMLExcepts::Trans_CantCreateCvtrFor , fEncodingStr , fMemoryManager ); } // // Note that, unlike above, we do not do an initial decode of the // first line. We take the caller's word that the encoding is correct // and just assume that the first bulk decode (kicked off by the first // get of a character) will work. // // So we do here the slipping in of the leading space if required. // if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral)) { // This represents no data from the source fCharSizeBuf[fCharsAvail] = 0; fCharOfsBuf[fCharsAvail] = 0; fCharBuf[fCharsAvail++] = chSpace; } } XMLReader::~XMLReader() { fMemoryManager->deallocate(fEncodingStr); fMemoryManager->deallocate(fPublicId); fMemoryManager->deallocate(fSystemId); delete fStream; delete fTranscoder; } // --------------------------------------------------------------------------- // XMLReader: Character buffer management methods // --------------------------------------------------------------------------- XMLFilePos XMLReader::getSrcOffset() const { if (!fSrcOfsSupported || !fCalculateSrcOfs) ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_SrcOfsNotSupported, fMemoryManager); // // Take the current source offset and add in the sizes that we've // eaten from the source so far. // if( fCharIndex == 0 ) { return fSrcOfsBase; } if( fCharIndex < fCharsAvail ) { return (fSrcOfsBase + fCharOfsBuf[fCharIndex]); } return (fSrcOfsBase + fCharOfsBuf[fCharIndex-1] + fCharSizeBuf[fCharIndex-1]); } bool XMLReader::refreshCharBuffer() { // If the no more flag is set, then don't bother doing anything. if (fNoMore) return false; XMLSize_t startInd; // See if we have any existing chars. const XMLSize_t spareChars = fCharsAvail - fCharIndex; // If we are full, then don't do anything. if (spareChars == kCharBufSize) return true; // // If no transcoder has been created yet, then we never saw the // any encoding="" string and the encoding was not forced, so lets // create one now. We know that it won't change now. // // However, note that if we autosensed EBCDIC, then we have to // consider it an error if we never got an encoding since we don't // know what variant of EBCDIC it is. // if (!fTranscoder) { if (fEncoding == XMLRecognizer::EBCDIC) ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_EncodingStrRequired, fMemoryManager); // Ask the transcoding service to make use a transcoder XMLTransService::Codes failReason; fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor ( fEncodingStr , failReason , kCharBufSize , fMemoryManager ); if (!fTranscoder) { ThrowXMLwithMemMgr1 ( TranscodingException , XMLExcepts::Trans_CantCreateCvtrFor , fEncodingStr , fMemoryManager ); } } // // Add the number of source bytes eaten so far to the base src // offset member. // if (fCalculateSrcOfs) { for (startInd = 0; startInd < fCharIndex; startInd++) fSrcOfsBase += fCharSizeBuf[startInd]; } // // If there are spare chars, then move then down to the bottom. We // have to move the char sizes down also. // startInd = 0; if (spareChars) { for (XMLSize_t index = fCharIndex; index < fCharsAvail; index++) { fCharBuf[startInd] = fCharBuf[index]; fCharSizeBuf[startInd] = fCharSizeBuf[index]; startInd++; } } // // And then get more chars, starting after any spare chars that were // left over from the last time. // fCharsAvail = xcodeMoreChars ( &fCharBuf[startInd] , &fCharSizeBuf[startInd] , kCharBufSize - spareChars ); // Add back in the spare chars fCharsAvail += spareChars; // Reset the buffer index to zero, so we start from the 0th char again fCharIndex = 0; // // If no chars available, then we have to check for one last thing. If // this is reader for a PE and its not being expanded inside a literal, // then unget a trailing space. We use a boolean to avoid triggering // this more than once. // if (!fCharsAvail && (fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral) && !fSentTrailingSpace) { fCharBuf[0] = chSpace; fCharsAvail = 1; fSentTrailingSpace = true; } // // If we get here with no more chars, then set the fNoMore flag which // lets us optimize and know without checking that no more chars are // available. // if (!fCharsAvail) fNoMore = true; // Calculate fCharOfsBuf using the elements from fCharBufSize if (fCalculateSrcOfs) { unsigned int last = 0; fCharOfsBuf[0] = 0; for (XMLSize_t index = 1; index < fCharsAvail; ++index) { fCharOfsBuf[index] = last+fCharSizeBuf[index-1]; last = fCharOfsBuf[index]; // code was: // fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1]; // but on Solaris 64 bit with sun studio 11 this didn't work as // every value of fCharOfsBuf[] was 1. } } return (fCharsAvail != 0); } // --------------------------------------------------------------------------- // XMLReader: Scanning methods // --------------------------------------------------------------------------- bool XMLReader::getName(XMLBuffer& toFill, const bool token) { // Ok, first lets see if we have chars in the buffer. If not, then lets // reload. if (fCharIndex == fCharsAvail) { if (!refreshCharBuffer()) return false; } XMLSize_t charIndex_start = fCharIndex; // Lets check the first char for being a first name char. If not, then // what's the point in living mannnn? Just give up now. We only do this // if its a name and not a name token that they want. if (!token) { if (fXMLVersion == XMLV1_1 && ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F))) { // make sure one more char is in the buffer, the transcoder // should put only a complete surrogate pair into the buffer assert(fCharIndex+1 < fCharsAvail); if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF)) return false; // Looks ok, so lets eat it fCharIndex += 2; } else { if (!isFirstNameChar(fCharBuf[fCharIndex])) return false; // Looks ok, so lets eat it fCharIndex ++; } } // And now we loop until we run out of data in this reader or we hit // a non-name char. while (true) { if (fXMLVersion == XMLV1_1) { while (fCharIndex < fCharsAvail) { // Check the current char and take it if its a name char. Else // break out. if ( (fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) ) { // make sure one more char is in the buffer, the transcoder // should put only a complete surrogate pair into the buffer assert(fCharIndex+1 < fCharsAvail); if ( (fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF) ) break; fCharIndex += 2; } else { if (!isNameChar(fCharBuf[fCharIndex])) break; fCharIndex++; } } } else // XMLV1_0 { while (fCharIndex < fCharsAvail) { if (!isNameChar(fCharBuf[fCharIndex])) break; fCharIndex++; } } // we have to copy the accepted character(s), and update column if (fCharIndex != charIndex_start) { fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start); toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start); } // something is wrong if there is still something in the buffer // or if we don't get no more, then break out. if ((fCharIndex < fCharsAvail) || !refreshCharBuffer()) break; charIndex_start = fCharIndex; } return !toFill.isEmpty(); } bool XMLReader::getNCName(XMLBuffer& toFill) { if (fCharIndex == fCharsAvail && !refreshCharBuffer()) return false; XMLSize_t charIndex_start = fCharIndex, count; // Lets check the first char for being a first name char. If not, then // what's the point in living mannnn? Just give up now. We only do this // if its a name and not a name token that they want. if (fXMLVersion == XMLV1_1 && ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F))) { // make sure one more char is in the buffer, the transcoder // should put only a complete surrogate pair into the buffer assert(fCharIndex+1 < fCharsAvail); if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF)) return false; // Looks ok, so lets eat it fCharIndex += 2; } else { if (!isFirstNCNameChar(fCharBuf[fCharIndex])) { return false; } // Looks ok, so lets eat it fCharIndex++; } do { if (fCharIndex == fCharsAvail) { // we have to copy the accepted character(s), and update the column number, // before getting new data and losing the value of fCharIndex if((count = fCharIndex - charIndex_start)!=0) { fCurCol += (XMLFileLoc)count; toFill.append(&fCharBuf[charIndex_start], count); } if(!refreshCharBuffer()) return true; charIndex_start = fCharIndex; } // Check the current char and take it if it's a name char if (fXMLVersion == XMLV1_1) { while(fCharIndex < fCharsAvail) { if(isNCNameChar(fCharBuf[fCharIndex])) fCharIndex++; else if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) && ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))) fCharIndex+=2; else break; } } else while(fCharIndex < fCharsAvail && isNCNameChar(fCharBuf[fCharIndex])) fCharIndex++; // if we didn't consume the entire buffer, we are done } while(fCharIndex == fCharsAvail); // we have to copy the accepted character(s), and update column if((count = fCharIndex - charIndex_start)!=0) { fCurCol += (XMLFileLoc)count; toFill.append(&fCharBuf[charIndex_start], count); } return true; } bool XMLReader::getQName(XMLBuffer& toFill, int* colonPosition) { // We are only looking for two iterations (i.e. 'NCNAME':'NCNAME'). // We will stop when we finished scanning for a QName (i.e. either a second // colon or an invalid char). if(!getNCName(toFill)) { *colonPosition = -1; return false; } if (fCharIndex == fCharsAvail && !refreshCharBuffer()) { *colonPosition = -1; return true; } if (fCharBuf[fCharIndex] != chColon) { *colonPosition = -1; return true; } *colonPosition = (int)toFill.getLen(); toFill.append(chColon); fCharIndex++; fCurCol++; return getNCName(toFill); } bool XMLReader::getSpaces(XMLBuffer& toFill) { // // We just loop until we either hit a non-space or the end of this // entity. We return true if we returned because of a non-space and // false if because of end of entity. // // NOTE: We have to maintain line/col info here and we have to do // whitespace normalization if we are not already internalized. // while (true) { // Loop through the current chars in the buffer while (fCharIndex < fCharsAvail) { // Get the current char out of the buffer XMLCh curCh = fCharBuf[fCharIndex]; // // See if its a white space char. If so, then process it. Else // we've hit a non-space and need to return. // if (isWhitespace(curCh)) { // Eat this char fCharIndex++; // // 'curCh' is a whitespace(x20|x9|xD|xA), so we only can have // end-of-line combinations with a leading chCR(xD) or chLF(xA) // // 100000 x20 // 001001 x9 // 001010 chLF // 001101 chCR // ----------- // 000110 == (chCR|chLF) & ~(0x9|0x20) // // if the result of thelogical-& operation is // true : 'curCh' must be xA or xD // false : 'curCh' must be x20 or x9 // if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 ) { fCurCol++; } else { handleEOL(curCh, false); } // Ok we can add this guy to our buffer toFill.append(curCh); } else { // Return true to indicate we broke out due to a whitespace return true; } } // // We've eaten up the current buffer, so lets try to reload it. If // we don't get anything new, then break out. If we do, then we go // back to the top to keep getting spaces. // if (!refreshCharBuffer()) break; } return false; } bool XMLReader::getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck) { while (true) { // Loop through the current chars in the buffer while (fCharIndex < fCharsAvail) { // Get the current char out of the buffer XMLCh curCh = fCharBuf[fCharIndex]; // // See if its not a white space or our target char, then process // it. Else, we need to return. // if (!isWhitespace(curCh) && (curCh != toCheck)) { // Eat this char fCharIndex++; // // 'curCh' is not a whitespace(x20|x9|xD|xA), so we only can // have end-of-line combinations with a leading chNEL(x85) or // chLineSeparator(x2028) // // 0010000000101000 chLineSeparator // 0000000010000101 chNEL // --------------------- // 1101111101010010 == ~(chNEL|chLineSeparator) // // if the result of the logical-& operation is // true : 'curCh' can not be chNEL or chLineSeparator // false : 'curCh' can be chNEL or chLineSeparator // if ( curCh & (XMLCh) ~(chNEL|chLineSeparator) ) { fCurCol++; } else { handleEOL(curCh, false); } // Add it to our buffer toFill.append(curCh); } else { return true; } } // // We've eaten up the current buffer, so lets try to reload it. If // we don't get anything new, then break out. If we do, then we go // back to the top to keep getting spaces. // if (!refreshCharBuffer()) break; } // We never hit any non-space and ate up the whole reader return false; } bool XMLReader::skipIfQuote(XMLCh& chGotten) { if (fCharIndex == fCharsAvail && !refreshCharBuffer()) return false; chGotten = fCharBuf[fCharIndex]; if ((chGotten == chDoubleQuote) || (chGotten == chSingleQuote)) { fCharIndex++; fCurCol++; return true; } return false; } bool XMLReader::skipSpaces(bool& skippedSomething, bool inDecl) { // DO NOT set the skippedSomething to 'false', but change it to be 'true' only // We enter a loop where we skip over spaces until we hit the end of // this reader or a non-space value. The return indicates whether we // hit the non-space (true) or the end (false). do { // Loop through the current chars in the buffer while (fCharIndex < fCharsAvail) { // See if its a white space char. If so, then process it. Else // we've hit a non-space and need to return. if (isWhitespace(fCharBuf[fCharIndex])) { // Get the current char out of the buffer and eat it XMLCh curCh = fCharBuf[fCharIndex++]; skippedSomething = true; // // 'curCh' is a whitespace(x20|x9|xD|xA), so we only can have // end-of-line combinations with a leading chCR(xD) or chLF(xA) // // 100000 x20 // 001001 x9 // 001010 chLF // 001101 chCR // ----------- // 000110 == (chCR|chLF) & ~(0x9|0x20) // // if the result of the logical-& operation is // true : 'curCh' must be xA or xD // false : 'curCh' must be x20 or x9 // if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 ) { fCurCol++; } else { handleEOL(curCh, inDecl); } } else return true; } // We've eaten up the current buffer, so lets try to reload it. If // we don't get anything new, then break out. If we do, then we go // back to the top to keep getting spaces. } while(refreshCharBuffer()); // We never hit any non-space and ate up the whole reader return false; } bool XMLReader::skippedChar(const XMLCh toSkip) { // // If the buffer is empty, then try to reload it. If we still get // nothing, then return false. // if (fCharIndex == fCharsAvail) { if (!refreshCharBuffer()) return false; } // // See if the current char is the one we want. If so, then we need // to eat it and return true. // if (fCharBuf[fCharIndex] == toSkip) { fCharIndex++; fCurCol++; return true; } return false; } bool XMLReader::skippedSpace() { // // If the buffer is empty, then try to reload it. If we still get // nothing, then return false. // if (fCharIndex == fCharsAvail) { if (!refreshCharBuffer()) return false; } // // See if the current char is a whitespace. If so, then we need to eat // it and return true. // const XMLCh curCh = fCharBuf[fCharIndex]; if (isWhitespace(curCh)) { // Eat the character fCharIndex++; // // 'curCh' is a whitespace(x20|x9|xD|xA), so we only can have // end-of-line combinations with a leading chCR(xD) or chLF(xA) // // 100000 x20 // 001001 x9 // 001010 chLF // 001101 chCR // ----------- // 000110 == (chCR|chLF) & ~(0x9|0x20) // // if the result of the logical-& operation is // true : 'curCh' must be xA or xD // false : 'curCh' must be x20 or x9 // if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 ) { fCurCol++; } else { handleEOL((XMLCh&)curCh, false); } return true; } return false; } bool XMLReader::skippedString(const XMLCh* const toSkip) { // This function works on strings that are smaller than kCharBufSize. // This function guarantees that in case the comparison is unsuccessful // the fCharIndex will point to the original data. // // Get the length of the string to skip. // const XMLSize_t srcLen = XMLString::stringLen(toSkip); XMLSize_t charsLeft = charsLeftInBuffer(); // See if the current reader has enough chars to test against this // string. If not, then ask it to reload its buffer. If that does not // get us enough, then it cannot match. // // NOTE: This works because strings never have to cross a reader! And // a string to skip will never have a new line in it, so we will never // miss adjusting the current line. // while (charsLeft < srcLen) { if (!refreshCharBuffer()) return false; XMLSize_t tmp = charsLeftInBuffer(); if (tmp == charsLeft) // if the refreshCharBuf() did not add anything new return false; // give up and return. charsLeft = tmp; } // Ok, now we now that the current reader has enough chars in its // buffer and that its index is back at zero. So we can do a quick and // dirty comparison straight to its buffer with no requirement to unget // if it fails. // if (memcmp(&fCharBuf[fCharIndex], toSkip, srcLen * sizeof(XMLCh))) return false; // Add the source length to the current column to get it back right. // fCurCol += (XMLFileLoc)srcLen; // And get the character buffer index back right by just adding the // source len to it. // fCharIndex += srcLen; return true; } bool XMLReader::skippedStringLong(const XMLCh* toSkip) { // This function works on strings that are potentially longer than // kCharBufSize (e.g., end tag). This function does not guarantee // that in case the comparison is unsuccessful the fCharIndex will // point to the original data. // XMLSize_t srcLen = XMLString::stringLen(toSkip); XMLSize_t charsLeft = charsLeftInBuffer(); while (srcLen != 0) { // Fill up the buffer with as much data as possible. // while (charsLeft < srcLen && charsLeft != kCharBufSize) { if (!refreshCharBuffer()) return false; XMLSize_t tmp = charsLeftInBuffer(); if (tmp == charsLeft) // if the refreshCharBuf() did not add anything return false; // new give up and return. charsLeft = tmp; } XMLSize_t n = charsLeft < srcLen ? charsLeft : srcLen; if (memcmp(&fCharBuf[fCharIndex], toSkip, n * sizeof(XMLCh))) return false; toSkip += n; srcLen -= n; fCharIndex += n; fCurCol += (XMLFileLoc)n; charsLeft -= n; } return true; } // // This is just to peek if the next coming buffer // matches the string toPeek. // Similar to skippedString, but just the fCharIndex and fCurCol are not updated // bool XMLReader::peekString(const XMLCh* const toPeek) { // Get the length of the string to skip const XMLSize_t srcLen = XMLString::stringLen(toPeek); // // See if the current reader has enough chars to test against this // string. If not, then ask it to reload its buffer. If that does not // get us enough, then it cannot match. // // NOTE: This works because strings never have to cross a reader! And // a string to skip will never have a new line in it, so we will never // miss adjusting the current line. // XMLSize_t charsLeft = charsLeftInBuffer(); while (charsLeft < srcLen) { refreshCharBuffer(); XMLSize_t t = charsLeftInBuffer(); if (t == charsLeft) // if the refreshCharBuf() did not add anything new return false; // give up and return. charsLeft = t; } // // Ok, now we now that the current reader has enough chars in its // buffer and that its index is back at zero. So we can do a quick and // dirty comparison straight to its buffer with no requirement to unget // if it fails. // if (memcmp(&fCharBuf[fCharIndex], toPeek, srcLen*sizeof(XMLCh))) return false; return true; } // --------------------------------------------------------------------------- // XMLReader: Setter methods (most are inlined) // --------------------------------------------------------------------------- bool XMLReader::setEncoding(const XMLCh* const newEncoding) { // // If the encoding was forced, then we ignore the new value and just // return with success. If it was forced, then we are to use that // encoding without question. Note that, if we are forced, we created // a transcoder up front so there is no need to do one here in that // case. // if (fForcedEncoding) return true; // // upperCase the newEncoding first for better performance // XMLCh* inputEncoding = XMLString::replicate(newEncoding, fMemoryManager); XMLString::upperCaseASCII(inputEncoding); XMLRecognizer::Encodings newBaseEncoding; // // Check for non-endian specific UTF-16 or UCS-4. If so, and if we // are already in one of the endian versions of those encodings, // then just keep it and go on. Otherwise, its not valid. // if (XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString) || XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString2) || XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString3) || XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString4) || XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString5) || XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString6) || XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString7)) { fMemoryManager->deallocate(inputEncoding); if ((fEncoding != XMLRecognizer::UTF_16L) && (fEncoding != XMLRecognizer::UTF_16B)) { return false; } // Override with the original endian specific encoding newBaseEncoding = fEncoding; if (fEncoding == XMLRecognizer::UTF_16L) { fMemoryManager->deallocate(fEncodingStr); fEncodingStr = 0; fEncodingStr = XMLString::replicate(XMLUni::fgUTF16LEncodingString, fMemoryManager); } else { fMemoryManager->deallocate(fEncodingStr); fEncodingStr = 0; fEncodingStr = XMLString::replicate(XMLUni::fgUTF16BEncodingString, fMemoryManager); } } else if (XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString) || XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString2) || XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString3) || XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString4) || XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString5)) { fMemoryManager->deallocate(inputEncoding); if ((fEncoding != XMLRecognizer::UCS_4L) && (fEncoding != XMLRecognizer::UCS_4B)) { return false; } // Override with the original endian specific encoding newBaseEncoding = fEncoding; if (fEncoding == XMLRecognizer::UCS_4L) { fMemoryManager->deallocate(fEncodingStr); fEncodingStr = 0; fEncodingStr = XMLString::replicate(XMLUni::fgUCS4LEncodingString, fMemoryManager); } else { fMemoryManager->deallocate(fEncodingStr); fEncodingStr = 0; fEncodingStr = XMLString::replicate(XMLUni::fgUCS4BEncodingString, fMemoryManager); } } else { // // Try to map the string to one of our standard encodings. If its not // one of them, then it has to be one of the non-intrinsic encodings, // in which case we have to delete our intrinsic encoder and create a // new one. // newBaseEncoding = XMLRecognizer::encodingForName(inputEncoding); // // If it does not come back as one of the auto-sensed encodings, then we // have to possibly replace it and at least check a few things. // if (newBaseEncoding == XMLRecognizer::OtherEncoding) { // // We already know it's none of those non-endian special cases, // so just replicate the new name and use it directly to create the transcoder // fMemoryManager->deallocate(fEncodingStr); fEncodingStr = inputEncoding; XMLTransService::Codes failReason; fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor ( fEncodingStr , failReason , kCharBufSize , fMemoryManager ); } else { // Store the new encoding string since it is just an intrinsic fMemoryManager->deallocate(fEncodingStr); fEncodingStr = inputEncoding; } } if (!fTranscoder) { // // Now we can create a transcoder using the recognized fEncoding. We // might get back a transcoder for an intrinsically supported encoding, // or we might get one from the underlying transcoding service. // XMLTransService::Codes failReason; fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor ( newBaseEncoding , failReason , kCharBufSize , fMemoryManager ); if (!fTranscoder) ThrowXMLwithMemMgr1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr, fMemoryManager); } // Update the base encoding member with the new base encoding found fEncoding = newBaseEncoding; // Looks ok to us return true; } // --------------------------------------------------------------------------- // XMLReader: Private helper methods // --------------------------------------------------------------------------- // // This is called when the encoding flag is set and just sets the fSwapped // flag appropriately. // void XMLReader::checkForSwapped() { // Assume not swapped fSwapped = false; if (XMLPlatformUtils::fgXMLChBigEndian) { if ((fEncoding == XMLRecognizer::UTF_16L) || (fEncoding == XMLRecognizer::UCS_4L)) { fSwapped = true; } } else { if ((fEncoding == XMLRecognizer::UTF_16B) || (fEncoding == XMLRecognizer::UCS_4B)) { fSwapped = true; } } } // // This is called from the constructor when the encoding is not forced. // We assume that the encoding has been auto-sensed at this point and that // fSwapped is set correctly. // // In the case of UCS-4 and EBCDIC, we don't have to check for a decl. // The fact that we got here, means that there is one, because that's the // only way we can autosense those. // void XMLReader::doInitDecode() { switch(fEncoding) { case XMLRecognizer::UCS_4B : case XMLRecognizer::UCS_4L : { // Remove bom if any if (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) || ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00)) ) { for (XMLSize_t i = 0; i < fRawBytesAvail; i++) fRawByteBuf[i] = fRawByteBuf[i+4]; fRawBytesAvail -=4; } // Look at the raw buffer as UCS4 chars const UCS4Ch* asUCS = (const UCS4Ch*)fRawByteBuf; while (fRawBufIndex < fRawBytesAvail) { // Get out the current 4 byte value and inc our raw buf index UCS4Ch curVal = *asUCS++; fRawBufIndex += sizeof(UCS4Ch); // Swap if that is required for this machine if (fSwapped) curVal = BitOps::swapBytes(curVal); // Make sure its at least semi legal. If not, undo and throw if (curVal > 0xFFFF) { fCharsAvail = 0; fRawBufIndex = 0; fMemoryManager->deallocate(fPublicId); fMemoryManager->deallocate(fEncodingStr); ArrayJanitor janValue(fSystemId, fMemoryManager); ThrowXMLwithMemMgr1 ( TranscodingException , XMLExcepts::Reader_CouldNotDecodeFirstLine , fSystemId , fMemoryManager ); } // Convert the value to an XML char and store it fCharSizeBuf[fCharsAvail] = 4; fCharBuf[fCharsAvail++] = XMLCh(curVal); // Break out on the > character if (curVal == chCloseAngle) break; } break; } case XMLRecognizer::UTF_8 : { // If there's a utf-8 BOM (0xEF 0xBB 0xBF), skip past it. // Don't move to char buf - no one wants to see it. // Note: this causes any encoding= declaration to override // the BOM's attempt to say that the encoding is utf-8. // Look at the raw buffer as short chars const char* asChars = (const char*)fRawByteBuf; if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen && XMLString::compareNString( asChars , XMLRecognizer::fgUTF8BOM , XMLRecognizer::fgUTF8BOMLen) == 0) { fRawBufIndex += XMLRecognizer::fgUTF8BOMLen; asChars += XMLRecognizer::fgUTF8BOMLen; } // // First check that there are enough bytes to even see the // decl indentifier. If not, get out now with no action since // there is no decl. // if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen) break; // Check for the opening sequence. If not, then no decl if (XMLString::compareNString( asChars , XMLRecognizer::fgASCIIPre , XMLRecognizer::fgASCIIPreLen)) { break; } while (fRawBufIndex < fRawBytesAvail) { const char curCh = *asChars++; fRawBufIndex++; // Looks ok, so store it fCharSizeBuf[fCharsAvail] = 1; fCharBuf[fCharsAvail++] = XMLCh(curCh); // Break out on a > character if (curCh == chCloseAngle) break; // // A char greater than 0x7F is not allowed in this case. If // so, undo and throw. // if (curCh & 0x80) { fCharsAvail = 0; fRawBufIndex = 0; fMemoryManager->deallocate(fPublicId); fMemoryManager->deallocate(fEncodingStr); ArrayJanitor janValue(fSystemId, fMemoryManager); ThrowXMLwithMemMgr1 ( TranscodingException , XMLExcepts::Reader_CouldNotDecodeFirstLine , fSystemId , fMemoryManager ); } } break; } case XMLRecognizer::UTF_16B : case XMLRecognizer::UTF_16L : { // // If there is a decl here, we just truncate back the characters // as we go. No surrogate creation would be allowed here in legal // XML, so we consider it a transoding error if we find one. // if (fRawBytesAvail < 2) break; XMLSize_t postBOMIndex = 0; const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex]; if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker)) { fRawBufIndex += sizeof(UTF16Ch); asUTF16++; postBOMIndex = fRawBufIndex; } // First check that there are enough raw bytes for there to even // be a decl indentifier. If not, then nothing to do. // if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen) { fRawBufIndex = postBOMIndex; break; } // // See we get a match on the prefix. If not, then reset and // break out. // if (fEncoding == XMLRecognizer::UTF_16B) { if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen)) { fRawBufIndex = postBOMIndex; break; } } else { if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen)) { fRawBufIndex = postBOMIndex; break; } } while (fRawBufIndex < fRawBytesAvail) { // Get out the current 2 byte value UTF16Ch curVal = *asUTF16++; fRawBufIndex += sizeof(UTF16Ch); // Swap if that is required for this machine if (fSwapped) curVal = BitOps::swapBytes(curVal); // // Store it and bump the target index, implicitly converting // if UTF16Ch and XMLCh are not the same size. // fCharSizeBuf[fCharsAvail] = 2; fCharBuf[fCharsAvail++] = curVal; // Break out on a > char if (curVal == chCloseAngle) break; } break; } case XMLRecognizer::EBCDIC : { // // We use special support in the intrinsic EBCDIC-US transcoder // to go through one char at a time. // const XMLByte* srcPtr = fRawByteBuf; while (1) { // Transcode one char from the source const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++); fRawBufIndex++; // // And put it into the character buffer. This stuff has to // look like it was normally transcoded. // fCharSizeBuf[fCharsAvail] = 1; fCharBuf[fCharsAvail++] = chCur; // If its a > char, then break out if (chCur == chCloseAngle) break; // Watch for using up all input and get out if (fRawBufIndex == fRawBytesAvail) break; } break; } default : // It should never be anything else here fMemoryManager->deallocate(fPublicId); fMemoryManager->deallocate(fEncodingStr); fMemoryManager->deallocate(fSystemId); ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager); break; } // // Ok, by the time we get here, if its a legal XML file we have eaten // the XML/TextDecl. So, if we are a PE and are being referenced from // outside a literal, then we need to throw in an arbitrary space that // is required by XML. // if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral)) fCharBuf[fCharsAvail++] = chSpace; // Calculate fCharOfsBuf buffer using the elements from fCharBufSize if (fCalculateSrcOfs) { fCharOfsBuf[0] = 0; for (XMLSize_t index = 1; index < fCharsAvail; ++index) { fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1]; } } } // // This method is called internally when we run out of bytes in the raw // buffer. We just read as many bytes as we can into the raw buffer again // and store the number of bytes we got. // void XMLReader::refreshRawBuffer() { // // If there are any bytes left, move them down to the start. There // should only ever be (max bytes per char - 1) at the most. // const XMLSize_t bytesLeft = fRawBytesAvail - fRawBufIndex; // Move the existing ones down for (XMLSize_t index = 0; index < bytesLeft; index++) fRawByteBuf[index] = fRawByteBuf[fRawBufIndex + index]; // // And then read into the buffer past the existing bytes. Add back in // that many to the bytes read, and subtract that many from the bytes // requested. // fRawBytesAvail = fStream->readBytes ( &fRawByteBuf[bytesLeft], kRawBufSize - bytesLeft ) + bytesLeft; // // We need to reset the buffer index back to the start in all cases, // since any trailing data was copied down to the start. // fRawBufIndex = 0; } // // This method is called internally when we run out of characters in the // trancoded character buffer. We transcode up to another maxChars chars // from the // XMLSize_t XMLReader::xcodeMoreChars( XMLCh* const bufToFill , unsigned char* const charSizes , const XMLSize_t maxChars) { XMLSize_t charsDone = 0; XMLSize_t bytesEaten = 0; bool needMode = false; while (!bytesEaten) { // If our raw buffer is low, then lets load up another batch of // raw bytes now. // XMLSize_t bytesLeft = fRawBytesAvail - fRawBufIndex; if (needMode || bytesLeft == 0 || bytesLeft < fLowWaterMark) { refreshRawBuffer(); // If there are no characters or if we need more but didn't get // any, return zero now. // if (fRawBytesAvail == 0 || (needMode && (bytesLeft == fRawBytesAvail - fRawBufIndex))) return 0; } // Ask the transcoder to internalize another batch of chars. It is // possible that there is data in the raw buffer but the transcoder // is unable to produce anything because transcoding of multi-byte // encodings may have left a few bytes representing a partial // character in the buffer that can't be used until the next chunk // (and the rest of the character) is read. In this case set the // needMore flag and try again. // charsDone = fTranscoder->transcodeFrom ( &fRawByteBuf[fRawBufIndex] , fRawBytesAvail - fRawBufIndex , bufToFill , maxChars , bytesEaten , charSizes ); if (bytesEaten == 0) needMode = true; else fRawBufIndex += bytesEaten; } return charsDone; } /*** * * XML1.1 * * 2.11 End-of-Line Handling * * XML parsed entities are often stored in computer files which, for editing * convenience, are organized into lines. These lines are typically separated * by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA). * * To simplify the tasks of applications, the XML processor MUST behave as if * it normalized all line breaks in external parsed entities (including the document * entity) on input, before parsing, by translating all of the following to a single * #xA character: * * 1. the two-character sequence #xD #xA * 2. the two-character sequence #xD #x85 * 3. the single character #x85 * 4. the single character #x2028 * 5. any #xD character that is not immediately followed by #xA or #x85. * * ***/ void XMLReader::handleEOL(XMLCh& curCh, bool inDecl) { // 1. the two-character sequence #xD #xA // 2. the two-character sequence #xD #x85 // 5. any #xD character that is not immediately followed by #xA or #x85. switch(curCh) { case chCR: fCurCol = 1; fCurLine++; // // If not already internalized, then convert it to an // LF and eat any following LF. // if (fSource == Source_External) { if ((fCharIndex < fCharsAvail) || refreshCharBuffer()) { if ( fCharBuf[fCharIndex] == chLF || ((fCharBuf[fCharIndex] == chNEL) && fNEL) ) { fCharIndex++; } } curCh = chLF; } break; case chLF: fCurCol = 1; fCurLine++; break; // 3. the single character #x85 // 4. the single character #x2028 case chNEL: case chLineSeparator: if (inDecl && fXMLVersion == XMLV1_1) { /*** * XML1.1 * * 2.11 End-of-Line Handling * ... * The characters #x85 and #x2028 cannot be reliably recognized and translated * until an entity's encoding declaration (if present) has been read. * Therefore, it is a fatal error to use them within the XML declaration or * text declaration. * ***/ ThrowXMLwithMemMgr1 ( TranscodingException , XMLExcepts::Reader_NelLsepinDecl , fSystemId , fMemoryManager ); } if (fNEL && fSource == Source_External) { fCurCol = 1; fCurLine++; curCh = chLF; } break; default: fCurCol++; } } XERCES_CPP_NAMESPACE_END