/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * $Id$ */ // --------------------------------------------------------------------------- // Includes // --------------------------------------------------------------------------- #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include XERCES_CPP_NAMESPACE_BEGIN struct CMStateSetHasher { XMLSize_t getHashVal(const void *const key, XMLSize_t mod) { const CMStateSet* const pkey = (const CMStateSet*) key; return ((pkey->hashCode()) % mod); } bool equals(const void *const key1, const void *const key2) { const CMStateSet* const pkey1 = (const CMStateSet*) key1; const CMStateSet* const pkey2 = (const CMStateSet*) key2; return (*pkey1==*pkey2); } }; // --------------------------------------------------------------------------- // DFAContentModel: Constructors and Destructor // --------------------------------------------------------------------------- DFAContentModel::DFAContentModel( const bool dtd , ContentSpecNode* const elemContentSpec , MemoryManager* const manager) : fElemMap(0) , fElemMapType(0) , fElemMapSize(0) , fEmptyOk(false) , fEOCPos(0) , fFinalStateFlags(0) , fFollowList(0) , fHeadNode(0) , fLeafCount(0) , fLeafList(0) , fLeafListType(0) , fTransTable(0) , fTransTableSize(0) , fCountingStates(0) , fDTD(dtd) , fIsMixed(false) , fLeafNameTypeVector(0) , fMemoryManager(manager) { // And build the DFA data structures buildDFA(elemContentSpec); } DFAContentModel::DFAContentModel( const bool dtd , ContentSpecNode* const elemContentSpec , const bool isMixed , MemoryManager* const manager): fElemMap(0) , fElemMapType(0) , fElemMapSize(0) , fEmptyOk(false) , fEOCPos(0) , fFinalStateFlags(0) , fFollowList(0) , fHeadNode(0) , fLeafCount(0) , fLeafList(0) , fLeafListType(0) , fTransTable(0) , fTransTableSize(0) , fCountingStates(0) , fDTD(dtd) , fIsMixed(isMixed) , fLeafNameTypeVector(0) , fMemoryManager(manager) { // And build the DFA data structures buildDFA(elemContentSpec); } DFAContentModel::~DFAContentModel() { // // Clean up all the stuff that is not just temporary representation // data that was cleaned up after building the DFA. // fMemoryManager->deallocate(fFinalStateFlags); //delete [] fFinalStateFlags; unsigned int index; for (index = 0; index < fTransTableSize; index++) fMemoryManager->deallocate(fTransTable[index]); //delete [] fTransTable[index]; fMemoryManager->deallocate(fTransTable); //delete [] fTransTable; if(fCountingStates) { for (unsigned int j = 0; j < fTransTableSize; ++j) delete fCountingStates[j]; fMemoryManager->deallocate(fCountingStates); } for (index = 0; index < fLeafCount; index++) delete fElemMap[index]; fMemoryManager->deallocate(fElemMap); //delete [] fElemMap; fMemoryManager->deallocate(fElemMapType); //delete [] fElemMapType; fMemoryManager->deallocate(fLeafListType); //delete [] fLeafListType; delete fLeafNameTypeVector; } // --------------------------------------------------------------------------- // DFAContentModel: Implementation of the ContentModel virtual interface // --------------------------------------------------------------------------- bool DFAContentModel::validateContent( QName** const children , XMLSize_t childCount , unsigned int , XMLSize_t* indexFailingChild , MemoryManager* const) const { // // If there are no children, then either we fail on the 0th element // or we return success. It depends upon whether this content model // accepts empty content, which we determined earlier. // if (!childCount) { // success if(fEmptyOk) return true; *indexFailingChild=0; return false; } // // Lets loop through the children in the array and move our way // through the states. Note that we use the fElemMap array to map // an element index to a state index. // unsigned int curState = 0; unsigned int nextState = 0; unsigned int loopCount = 0; unsigned int childIndex = 0; for (; childIndex < childCount; childIndex++) { // Get the current element index out const QName* curElem = children[childIndex]; const XMLCh* curElemRawName = 0; if (fDTD) curElemRawName = curElem->getRawName(); // If this is text in a Schema mixed content model, skip it. if ( fIsMixed && ( curElem->getURI() == XMLElementDecl::fgPCDataElemId)) continue; // Look up this child in our element map unsigned int elemIndex = 0; for (; elemIndex < fElemMapSize; elemIndex++) { const QName* inElem = fElemMap[elemIndex]; if (fDTD) { if (XMLString::equals(inElem->getRawName(), curElemRawName)) { nextState = fTransTable[curState][elemIndex]; if (nextState != XMLContentModel::gInvalidTrans) break; } } else { ContentSpecNode::NodeTypes type = fElemMapType[elemIndex]; if (type == ContentSpecNode::Leaf) { if ((inElem->getURI() == curElem->getURI()) && (XMLString::equals(inElem->getLocalPart(), curElem->getLocalPart()))) { nextState = fTransTable[curState][elemIndex]; if (nextState != XMLContentModel::gInvalidTrans) break; } } else if ((type & 0x0f)== ContentSpecNode::Any) { nextState = fTransTable[curState][elemIndex]; if (nextState != XMLContentModel::gInvalidTrans) break; } else if ((type & 0x0f) == ContentSpecNode::Any_NS) { if (inElem->getURI() == curElem->getURI()) { nextState = fTransTable[curState][elemIndex]; if (nextState != XMLContentModel::gInvalidTrans) break; } } else if ((type & 0x0f) == ContentSpecNode::Any_Other) { // Here we assume that empty string has id 1. // unsigned int uriId = curElem->getURI(); if (uriId != 1 && uriId != inElem->getURI()) { nextState = fTransTable[curState][elemIndex]; if (nextState != XMLContentModel::gInvalidTrans) break; } } } }//for elemIndex // If "nextState" is -1, we found a match, but the transition is invalid if (nextState == XMLContentModel::gInvalidTrans) { *indexFailingChild=childIndex; return false; } // If we didn't find it, then obviously not valid if (elemIndex == fElemMapSize) { *indexFailingChild=childIndex; return false; } unsigned int nextLoop = 0; if(!handleRepetitions(curElem, curState, loopCount, nextState, nextLoop, elemIndex, 0)) { *indexFailingChild=childIndex; return false; } curState = nextState; loopCount = nextLoop; nextState = 0; }//for childIndex // // We transitioned all the way through the input list. However, that // does not mean that we ended in a final state. So check whether // our ending state is a final state. // if (!fFinalStateFlags[curState]) { *indexFailingChild=childIndex; return false; } // verify if we exited before the minOccurs was satisfied if (fCountingStates != 0) { Occurence* o = fCountingStates[curState]; if (o != 0 && loopCount < (unsigned int)o->minOccurs) { // not enough loops on the current state to be considered final. *indexFailingChild=childIndex; return false; } } //success return true; } bool DFAContentModel::validateContentSpecial(QName** const children , XMLSize_t childCount , unsigned int , GrammarResolver* const pGrammarResolver , XMLStringPool* const pStringPool , XMLSize_t* indexFailingChild , MemoryManager* const) const { SubstitutionGroupComparator comparator(pGrammarResolver, pStringPool); if (childCount == 0) { if(fEmptyOk) return true; *indexFailingChild=0; return false; } // // Lets loop through the children in the array and move our way // through the states. Note that we use the fElemMap array to map // an element index to a state index. // unsigned int curState = 0; unsigned int loopCount = 0; unsigned int nextState = 0; unsigned int childIndex = 0; for (; childIndex < childCount; childIndex++) { // Get the current element index out QName* curElem = children[childIndex]; // If this is text in a Schema mixed content model, skip it. if ( fIsMixed && ( curElem->getURI() == XMLElementDecl::fgPCDataElemId)) continue; // Look up this child in our element map unsigned int elemIndex = 0; for (; elemIndex < fElemMapSize; elemIndex++) { QName* inElem = fElemMap[elemIndex]; ContentSpecNode::NodeTypes type = fElemMapType[elemIndex]; if (type == ContentSpecNode::Leaf) { if (comparator.isEquivalentTo(curElem, inElem) ) { nextState = fTransTable[curState][elemIndex]; if (nextState != XMLContentModel::gInvalidTrans) break; } } else if ((type & 0x0f)== ContentSpecNode::Any) { nextState = fTransTable[curState][elemIndex]; if (nextState != XMLContentModel::gInvalidTrans) break; } else if ((type & 0x0f) == ContentSpecNode::Any_NS) { if (inElem->getURI() == curElem->getURI()) { nextState = fTransTable[curState][elemIndex]; if (nextState != XMLContentModel::gInvalidTrans) break; } } else if ((type & 0x0f) == ContentSpecNode::Any_Other) { // Here we assume that empty string has id 1. // unsigned int uriId = curElem->getURI(); if (uriId != 1 && uriId != inElem->getURI()) { nextState = fTransTable[curState][elemIndex]; if (nextState != XMLContentModel::gInvalidTrans) break; } } }//for elemIndex // If "nextState" is -1, we found a match, but the transition is invalid if (nextState == XMLContentModel::gInvalidTrans) { *indexFailingChild=childIndex; return false; } // If we didn't find it, then obviously not valid if (elemIndex == fElemMapSize) { *indexFailingChild=childIndex; return false; } unsigned int nextLoop = 0; if(!handleRepetitions(curElem, curState, loopCount, nextState, nextLoop, elemIndex, &comparator)) { *indexFailingChild=childIndex; return false; } curState = nextState; loopCount = nextLoop; nextState = 0; }//for childIndex // // We transitioned all the way through the input list. However, that // does not mean that we ended in a final state. So check whether // our ending state is a final state. // if (!fFinalStateFlags[curState]) { *indexFailingChild=childIndex; return false; } // verify if we exited before the minOccurs was satisfied if (fCountingStates != 0) { Occurence* o = fCountingStates[curState]; if (o != 0) { if (loopCount < (unsigned int)o->minOccurs) { // not enough loops on the current state. *indexFailingChild=childIndex; return false; } } } //success return true; } bool DFAContentModel::handleRepetitions(const QName* const curElem, unsigned int curState, unsigned int currentLoop, unsigned int& nextState, unsigned int& nextLoop, XMLSize_t elemIndex, SubstitutionGroupComparator * comparator) const { nextLoop = 0; if (fCountingStates != 0) { nextLoop = currentLoop; Occurence* o = fCountingStates[curState]; if (o != 0) { if (curState == nextState) { if (++nextLoop > (unsigned int)o->maxOccurs && o->maxOccurs != -1) { // It's likely that we looped too many times on the current state // however it's possible that we actually matched another particle // which allows the same name. // // Consider: // // // // // // // and // // // // // // // In the DFA there will be two transitions from the current state which // allow "foo". Note that this is not a UPA violation. The ambiguity of which // transition to take is resolved by the current value of the counter. Since // we've already seen enough instances of the first "foo" perhaps there is // another element declaration or wildcard deeper in the element map which // matches. unsigned int tempNextState = 0; while (++elemIndex < fElemMapSize) { QName* inElem = fElemMap[elemIndex]; ContentSpecNode::NodeTypes type = fElemMapType[elemIndex]; if (type == ContentSpecNode::Leaf) { if(comparator!=0) { if (comparator->isEquivalentTo(curElem, inElem) ) { tempNextState = fTransTable[curState][elemIndex]; if (tempNextState != XMLContentModel::gInvalidTrans) break; } } else if (fDTD) { if (XMLString::equals(inElem->getRawName(), curElem->getRawName())) { tempNextState = fTransTable[curState][elemIndex]; if (tempNextState != XMLContentModel::gInvalidTrans) break; } } else { if ((inElem->getURI() == curElem->getURI()) && (XMLString::equals(inElem->getLocalPart(), curElem->getLocalPart()))) { tempNextState = fTransTable[curState][elemIndex]; if (tempNextState != XMLContentModel::gInvalidTrans) break; } } } else if ((type & 0x0f)== ContentSpecNode::Any) { tempNextState = fTransTable[curState][elemIndex]; if (tempNextState != XMLContentModel::gInvalidTrans) break; } else if ((type & 0x0f) == ContentSpecNode::Any_NS) { if (inElem->getURI() == curElem->getURI()) { tempNextState = fTransTable[curState][elemIndex]; if (tempNextState != XMLContentModel::gInvalidTrans) break; } } else if ((type & 0x0f) == ContentSpecNode::Any_Other) { // Here we assume that empty string has id 1. // unsigned int uriId = curElem->getURI(); if (uriId != 1 && uriId != inElem->getURI()) { tempNextState = fTransTable[curState][elemIndex]; if (tempNextState != XMLContentModel::gInvalidTrans) break; } } } // if we still can't find a match, report the error if (elemIndex == fElemMapSize) return false; // if we found a match, set the next state and reset the // counter if the next state is a counting state. nextState = tempNextState; Occurence* o = fCountingStates[nextState]; if (o != 0) { nextLoop = (elemIndex == XMLSize_t (o->elemIndex)) ? 1 : 0; } } } else if (nextLoop < (unsigned int)o->minOccurs) { // not enough loops on the current state. return false; } else { // Exiting a counting state. If we're entering a new // counting state, reset the counter. o = fCountingStates[nextState]; if (o != 0) { nextLoop = (elemIndex == XMLSize_t (o->elemIndex)) ? 1 : 0; } } } else { o = fCountingStates[nextState]; if (o != 0) { // Entering a new counting state. Reset the counter. // If we've already seen one instance of the looping // particle set the counter to 1, otherwise set it // to 0. nextLoop = (elemIndex == XMLSize_t (o->elemIndex)) ? 1 : 0; } } } return true; } // --------------------------------------------------------------------------- // DFAContentModel: Private helper methods // --------------------------------------------------------------------------- void DFAContentModel::buildDFA(ContentSpecNode* const curNode) { unsigned int index; // // The first step we need to take is to rewrite the content model using // our CMNode objects, and in the process get rid of any repetition short // cuts, converting them into '*' style repetitions or getting rid of // repetitions altogether. // // The conversions done are: // // x+ -> (x|x*) // x? -> (x|epsilon) // // This is a relatively complex scenario. What is happening is that we // create a top level binary node of which the special EOC value is set // as the right side node. The the left side is set to the rewritten // syntax tree. The source is the original content model info from the // decl pool. The rewrite is done by buildSyntaxTree() which recurses the // decl pool's content of the element and builds a new tree in the // process. // // Note that, during this operation, we set each non-epsilon leaf node's // DFA state position and count the number of such leafs, which is left // in the fLeafCount member. // fLeafCount=countLeafNodes(curNode); fEOCPos = fLeafCount++; // We need to build an array of references to the non-epsilon // leaf nodes. We will put them in the array according to their position values // fLeafList = (CMLeaf**) fMemoryManager->allocate(fLeafCount*sizeof(CMLeaf*)); //new CMLeaf*[fLeafCount]; fLeafListType = (ContentSpecNode::NodeTypes*) fMemoryManager->allocate ( fLeafCount * sizeof(ContentSpecNode::NodeTypes) ); //new ContentSpecNode::NodeTypes[fLeafCount]; // // And, moving onward... We now need to build the follow position sets // for all the nodes. So we allocate an array of pointers to state sets, // one for each leaf node (i.e. each significant DFA position.) // fFollowList = (CMStateSet**) fMemoryManager->allocate ( fLeafCount * sizeof(CMStateSet*) ); //new CMStateSet*[fLeafCount]; for (index = 0; index < fLeafCount; index++) fFollowList[index] = new (fMemoryManager) CMStateSet(fLeafCount, fMemoryManager); // The buildSyntaxTree function will recursively iterate over the ContentSpecNode // and build the CMNode hierarchy; it will also put every leaf node in the fLeafList // array, then calculate the first and last position sets of each node. This is // cached away in each of the nodes. // // Along the way we also set the leaf count in each node as the maximum // state count. They must know this in order to create their first/last // position sets. // unsigned int counter=0; CMNode* nodeOrgContent = buildSyntaxTree(curNode, counter); // // Check to see whether this content model can handle an empty content, // which is something we need to optimize by looking now before we // throw away the info that would tell us that. // // If the left node of the head (the top level of the original content) // is nullable, then its true. // fEmptyOk = nodeOrgContent->isNullable(); // // And handle specially the EOC node, which also must be numbered and // counted as a non-epsilon leaf node. It could not be handled in the // above tree build because it was created before all that started. We // save the EOC position since its used during the DFA building loop. // CMLeaf* nodeEOC = new (fMemoryManager) CMLeaf ( new (fMemoryManager) QName ( XMLUni::fgZeroLenString , XMLUni::fgZeroLenString , XMLContentModel::gEOCFakeId , fMemoryManager ) , fEOCPos , true , fLeafCount , fMemoryManager ); fHeadNode = new (fMemoryManager) CMBinaryOp ( ContentSpecNode::Sequence , nodeOrgContent , nodeEOC , fLeafCount , fMemoryManager ); // Put also the final EOC node in the leaf array fLeafList[counter] = new (fMemoryManager) CMLeaf ( nodeEOC->getElement() , nodeEOC->getPosition() , fLeafCount , fMemoryManager ); fLeafListType[counter] = ContentSpecNode::Leaf; // // Now handle our top level. We use our left child's last pos set and our // right child's first pos set, so get them now for convenience. // const CMStateSet& last = nodeOrgContent->getLastPos(); const CMStateSet& first = nodeEOC->getFirstPos(); // // Now, for every position which is in our left child's last set // add all of the states in our right child's first set to the // follow set for that position. // CMStateSetEnumerator enumLast(&last); while(enumLast.hasMoreElements()) { XMLSize_t index=enumLast.nextElement(); *fFollowList[index] |= first; } // // And finally the big push... Now we build the DFA using all the states // and the tree we've built up. First we set up the various data // structures we are going to use while we do this. // // First of all we need an array of unique element ids in our content // model. For each transition table entry, we need a set of contiguous // indices to represent the transitions for a particular input element. // So we need to a zero based range of indexes that map to element types. // This element map provides that mapping. // fElemMap = (QName**) fMemoryManager->allocate ( fLeafCount * sizeof(QName*) ); //new QName*[fLeafCount]; fElemMapType = (ContentSpecNode::NodeTypes*) fMemoryManager->allocate ( fLeafCount * sizeof(ContentSpecNode::NodeTypes) ); //new ContentSpecNode::NodeTypes[fLeafCount]; fElemMapSize = 0; Occurence** elemOccurenceMap=0; for (unsigned int outIndex = 0; outIndex < fLeafCount; outIndex++) { fElemMap[outIndex] = new (fMemoryManager) QName(fMemoryManager); if ( (fLeafListType[outIndex] & 0x0f) != ContentSpecNode::Leaf ) if (!fLeafNameTypeVector) fLeafNameTypeVector = new (fMemoryManager) ContentLeafNameTypeVector(fMemoryManager); // Get the current leaf's element index CMLeaf* leaf=fLeafList[outIndex]; const QName* element = leaf->getElement(); const XMLCh* elementRawName = 0; if (fDTD && element) elementRawName = element->getRawName(); // See if the current leaf node's element index is in the list unsigned int inIndex = 0; for (; inIndex < fElemMapSize; inIndex++) { const QName* inElem = fElemMap[inIndex]; if (fDTD) { if (XMLString::equals(inElem->getRawName(), elementRawName)) { break; } } else { if ((fElemMapType[inIndex] == fLeafListType[outIndex]) && (inElem->getURI() == element->getURI()) && (XMLString::equals(inElem->getLocalPart(), element->getLocalPart()))) { break; } } } // If it was not in the list, then add it and bump the map size if (inIndex == fElemMapSize) { fElemMap[fElemMapSize]->setValues(*element); if(leaf->isRepeatableLeaf()) { if (elemOccurenceMap == 0) { elemOccurenceMap = (Occurence**)fMemoryManager->allocate(fLeafCount*sizeof(Occurence*)); memset(elemOccurenceMap, 0, fLeafCount*sizeof(Occurence*)); } elemOccurenceMap[fElemMapSize] = new (fMemoryManager) Occurence(((CMRepeatingLeaf*)leaf)->getMinOccurs(), ((CMRepeatingLeaf*)leaf)->getMaxOccurs(), fElemMapSize); } fElemMapType[fElemMapSize] = fLeafListType[outIndex]; ++fElemMapSize; } } // set up the fLeafNameTypeVector object if there is one. if (fLeafNameTypeVector) { fLeafNameTypeVector->setValues(fElemMap, fElemMapType, fElemMapSize); } /*** * Optimization(Jan, 2001); We sort fLeafList according to * elemIndex which is *uniquely* associated to each leaf. * We are *assuming* that each element appears in at least one leaf. **/ // don't forget to delete it #ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH int *fLeafSorter = (int*) fMemoryManager->allocate ( (fLeafCount + fElemMapSize) * sizeof(int) ); //new int[fLeafCount + fElemMapSize]; unsigned int fSortCount = 0; for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) { const QName* element = fElemMap[elemIndex]; const XMLCh* elementRawName = 0; if (fDTD && element) elementRawName = element->getRawName(); for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) { const QName* leaf = fLeafList[leafIndex]->getElement(); if (fDTD) { if (XMLString::equals(leaf->getRawName(), elementRawName)) { fLeafSorter[fSortCount++] = leafIndex; } } else { if ((fElemMapType[elemIndex] == fLeafListType[leafIndex]) && (leaf->getURI() == element->getURI()) && (XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) { fLeafSorter[fSortCount++] = leafIndex; } } } fLeafSorter[fSortCount++] = -1; } #endif // instead of using a single array with -1 to separate elements, use a bidimensional map unsigned int** fLeafSorter = (unsigned int**)fMemoryManager->allocate(fElemMapSize * sizeof(unsigned int*)); unsigned int* tmpSorter = (unsigned int*)fMemoryManager->allocate(fLeafCount * sizeof(unsigned int)); for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) { const QName* element = fElemMap[elemIndex]; const XMLCh* elementRawName = 0; if (fDTD && element) elementRawName = element->getRawName(); unsigned int fSortCount=0; for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) { const QName* leaf = fLeafList[leafIndex]->getElement(); if (fDTD) { if (XMLString::equals(leaf->getRawName(), elementRawName)) { tmpSorter[fSortCount++] = leafIndex; } } else { if ((fElemMapType[elemIndex] == fLeafListType[leafIndex]) && (leaf->getURI() == element->getURI()) && (XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) { tmpSorter[fSortCount++] = leafIndex; } } } fLeafSorter[elemIndex]=(unsigned int*)fMemoryManager->allocate((fSortCount+1) * sizeof(unsigned int)); fLeafSorter[elemIndex][0]=fSortCount; for (unsigned int index=0;indexdeallocate(tmpSorter); // // Next lets create some arrays, some that that hold transient info // during the DFA build and some that are permament. These are kind of // sticky since we cannot know how big they will get, but we don't want // to use any collection type classes because of performance. // // Basically they will probably be about fLeafCount*2 on average, but can // be as large as 2^(fLeafCount*2), worst case. So we start with // fLeafCount*4 as a middle ground. This will be very unlikely to ever // have to expand though, it if does, the overhead will be somewhat ugly. // unsigned int curArraySize = fLeafCount * 4; CMStateSet** statesToDo = (CMStateSet**) fMemoryManager->allocate ( curArraySize * sizeof(CMStateSet*) ); //new const CMStateSet*[curArraySize]; fFinalStateFlags = (bool*) fMemoryManager->allocate ( curArraySize * sizeof(bool) ); //new bool[curArraySize]; fTransTable = (unsigned int**) fMemoryManager->allocate ( curArraySize * sizeof(unsigned int*) ); //new unsigned int*[curArraySize]; // // Ok we start with the initial set as the first pos set of the head node // (which is the seq node that holds the content model and the EOC node.) // CMStateSet* setT = new (fMemoryManager) CMStateSet(fHeadNode->getFirstPos()); // // Note on memory leak: Bugzilla#2707: // =================================== // The CMBinary, pointed to by fHeadNode, shall be released by // deleted by itself. // // fLeafList[] maintains its **OWN** copy of CMLeaf to avoid double deletion // of CMLeaf. // delete fHeadNode; // // Init our two state flags. Basically the unmarked state counter is // always chasing the current state counter. When it catches up, that // means we made a pass through that did not add any new states to the // lists, at which time we are done. We could have used a expanding array // of flags which we used to mark off states as we complete them, but // this is easier though less readable maybe. // unsigned int unmarkedState = 0; unsigned int curState = 0; // // Init the first transition table entry, and put the initial state // into the states to do list, then bump the current state. // fTransTable[curState] = makeDefStateList(); statesToDo[curState] = setT; curState++; // // the stateTable is an auxiliary means to fast // identification of new state created (instead // of sequential loop statesToDo to find out), // while the role that statesToDo plays remain unchanged. // RefHashTableOf *stateTable = new (fMemoryManager) RefHashTableOf ( curArraySize , true , fMemoryManager ); //stateTable->put((CMStateSet*)setT, new (fMemoryManager) XMLInteger(0)); // // Ok, almost done with the algorithm from hell... We now enter the // loop where we go until the states done counter catches up with // the states to do counter. // CMStateSet* newSet = 0; while (unmarkedState < curState) { // // Get the next unmarked state out of the list of states to do. // And get the associated transition table entry. // setT = statesToDo[unmarkedState]; unsigned int* transEntry = fTransTable[unmarkedState]; // Mark this one final if it contains the EOC state fFinalStateFlags[unmarkedState] = setT->getBit(fEOCPos); // Bump up the unmarked state count, marking this state done unmarkedState++; #ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH // Optimization(Jan, 2001) unsigned int sorterIndex = 0; // Optimization(Jan, 2001) #endif // Loop through each possible input symbol in the element map for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) { // // Build up a set of states which is the union of all of the // follow sets of DFA positions that are in the current state. If // we gave away the new set last time through then create a new // one. Otherwise, zero out the existing one. // if (!newSet) newSet = new (fMemoryManager) CMStateSet ( fLeafCount , fMemoryManager ); else newSet->zeroBits(); #ifdef OBSOLETED // unoptimized code for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) { // If this leaf index (DFA position) is in the current set... if (setT->getBit(leafIndex)) { // // If this leaf is the current input symbol, then we want // to add its follow list to the set of states to transition // to from the current state. // const QName* leaf = fLeafList[leafIndex]->getElement(); const QName* element = fElemMap[elemIndex]; if (fDTD) { if (XMLString::equals(leaf->getRawName(), element->getRawName())) { *newSet |= *fFollowList[leafIndex]; } } else { if ((leaf->getURI() == element->getURI()) && (XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) { *newSet |= *fFollowList[leafIndex]; } } } } // for leafIndex #endif #ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH // Optimization(Jan, 2001) int leafIndex = fLeafSorter[sorterIndex++]; while (leafIndex != -1) { // If this leaf index (DFA position) is in the current set... if (setT->getBit(leafIndex)) { // // If this leaf is the current input symbol, then we // want to add its follow list to the set of states to // transition to from the current state. // *newSet |= *fFollowList[leafIndex]; } leafIndex = fLeafSorter[sorterIndex++]; } // while (leafIndex != -1) #endif unsigned int* fLeafIndexes=fLeafSorter[elemIndex]; unsigned int fNumItems=fLeafIndexes[0]; if(fNumItems!=0) { // The algorithm requires finding the leaf that is present both in the bitfield of the current state, and in the // list of places where the currently tested item can appear. When this occurs, the follow list of this parent item // is added to the bitfield representing the next state. // Both the bitfield and the list of places are sorted, so we can analyze them in two ways; either iterating over the // parent items, testing the bitfield for the existence of the parent (N times a constant Tb), or by iterating over the // bitfield (restricted to the range of the sorted list of places), using a binary search to locate the leaf in the // sorted list of places (M times log(N) testing operations Ts) // Assuming that the time to test a bit is roughly the same of the time needed to compute the average of two integers, // plus a couple of comparisons and additions, we compare N agains M*log(N) to decide which algorithm should be faster given // the two sets if(fNumItems <= setT->getBitCountInRange(fLeafIndexes[1], fLeafIndexes[fNumItems])*log((float)fNumItems)) { for(unsigned int i=1; i<=fNumItems; ++i) if(setT->getBit(fLeafIndexes[i])) { // // If this leaf is the current input symbol, then we // want to add its follow list to the set of states to // transition to from the current state. // *newSet |= *fFollowList[ fLeafIndexes[i] ]; } } else { // Further optimization: given that the bitfield enumerator returns the numbers in order, // every time we raise the lower marker we know it will true also for the next bits, so // the next binary search will not start from 1 but from this index unsigned int lowIndex = 1; // Start the enumerator from the first index in the sorted list of places, // as nothing before that point will match CMStateSetEnumerator enumBits(setT, fLeafIndexes[1]); while(enumBits.hasMoreElements()) { unsigned int bitIndex=enumBits.nextElement(); // if this leaf is greater than the last index in the sorted list of places, // nothing can be found from now on, so get out of here if(bitIndex > fLeafIndexes[fNumItems]) break; // Check if this leaf index (DFA position) is in the current set // (using binary search: the indexes are sorted) unsigned int first=lowIndex,last=fNumItems,i; while(first<=last) { i=(first+last)/2; if(fLeafIndexes[i]>bitIndex) last=i-1; else if(fLeafIndexes[i]isEmpty()) { // // Search the 'states to do' list to see if this new // state set is already in there. // /*** unsigned int stateIndex = 0; for (; stateIndex < curState; stateIndex++) { if (*statesToDo[stateIndex] == *newSet) break; } ***/ XMLInteger *stateObj = stateTable->get(newSet); unsigned int stateIndex = (stateObj == 0 ? curState : stateObj->intValue()); // If we did not find it, then add it if (stateIndex == curState) { // // Put this new state into the states to do and init // a new entry at the same index in the transition // table. // statesToDo[curState] = newSet; fTransTable[curState] = makeDefStateList(); stateTable->put ( newSet , new (fMemoryManager) XMLInteger(curState) ); // We now have a new state to do so bump the count curState++; // // Null out the new set to indicate we adopted it. This // will cause the creation of a new set on the next time // around the loop. // newSet = 0; } // // Now set this state in the transition table's entry for this // element (using its index), with the DFA state we will move // to from the current state when we see this input element. // transEntry[elemIndex] = stateIndex; // Expand the arrays if we're full if (curState == curArraySize) { // // Yikes, we overflowed the initial array size, so we've // got to expand all of these arrays. So adjust up the // size by 50% and allocate new arrays. // const unsigned int newSize = (unsigned int)(curArraySize * 1.5); CMStateSet** newToDo = (CMStateSet**) fMemoryManager->allocate ( newSize * sizeof(CMStateSet*) ); //new const CMStateSet*[newSize]; bool* newFinalFlags = (bool*) fMemoryManager->allocate ( newSize * sizeof(bool) ); //new bool[newSize]; unsigned int** newTransTable = (unsigned int**) fMemoryManager->allocate ( newSize * sizeof(unsigned int*) ); //new unsigned int*[newSize]; // Copy over all of the existing content for (unsigned int expIndex = 0; expIndex < curArraySize; expIndex++) { newToDo[expIndex] = statesToDo[expIndex]; newFinalFlags[expIndex] = fFinalStateFlags[expIndex]; newTransTable[expIndex] = fTransTable[expIndex]; } // Clean up the old stuff fMemoryManager->deallocate(statesToDo); //delete [] statesToDo; fMemoryManager->deallocate(fFinalStateFlags); //delete [] fFinalStateFlags; fMemoryManager->deallocate(fTransTable); //delete [] fTransTable; // Store the new array size and pointers curArraySize = newSize; statesToDo = newToDo; fFinalStateFlags = newFinalFlags; fTransTable = newTransTable; } //if (curState == curArraySize) } //if (!newSet->isEmpty()) } // for elemIndex } //while // Store the current state count in the trans table size fTransTableSize = curState; // // Fill in the occurence information for each looping state // if we're using counters. // if (elemOccurenceMap != 0) { fCountingStates = (Occurence**)fMemoryManager->allocate(fTransTableSize*sizeof(Occurence)); memset(fCountingStates, 0, fTransTableSize*sizeof(Occurence*)); for (unsigned int i = 0; i < fTransTableSize; ++i) { unsigned int * transitions = fTransTable[i]; for (unsigned int j = 0; j < fElemMapSize; ++j) { if (i == transitions[j]) { Occurence* old=elemOccurenceMap[j]; if(old!=0) fCountingStates[i] = new (fMemoryManager) Occurence(old->minOccurs, old->maxOccurs, old->elemIndex); break; } } } for (unsigned int j = 0; j < fLeafCount; ++j) { if(elemOccurenceMap[j]!=0) delete elemOccurenceMap[j]; } fMemoryManager->deallocate(elemOccurenceMap); } // If the last temp set was not stored, then clean it up if (newSet) delete newSet; // // Now we can clean up all of the temporary data that was needed during // DFA build. // for (index = 0; index < fLeafCount; index++) delete fFollowList[index]; fMemoryManager->deallocate(fFollowList); //delete [] fFollowList; // // removeAll() will delete all data, XMLInteger, // while the keys are to be deleted by the // deletion of statesToDo. // delete stateTable; for (index = 0; index < curState; index++) delete statesToDo[index]; fMemoryManager->deallocate(statesToDo); //delete [] statesToDo; for (index = 0; index < fLeafCount; index++) delete fLeafList[index]; fMemoryManager->deallocate(fLeafList); //delete [] fLeafList; #ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH fMemoryManager->deallocate(fLeafSorter); //delete [] fLeafSorter; #endif for (index=0; index < fElemMapSize; index++) fMemoryManager->deallocate(fLeafSorter[index]); fMemoryManager->deallocate(fLeafSorter); } unsigned int DFAContentModel::countLeafNodes(ContentSpecNode* const curNode) { unsigned int count = 0; // Get the spec type of the passed node const ContentSpecNode::NodeTypes curType = curNode->getType(); if ((curType & 0x0f) == ContentSpecNode::Any || (curType & 0x0f) == ContentSpecNode::Any_Other || (curType & 0x0f) == ContentSpecNode::Any_NS || curType == ContentSpecNode::Leaf || curType == ContentSpecNode::Loop) { count++; } else { // // Its not a leaf, so we have to recurse its left and maybe right // nodes. Save both values before we recurse and trash the node. // ContentSpecNode* leftNode = curNode->getFirst(); ContentSpecNode* rightNode = curNode->getSecond(); // Detect if we have a deep tree that can be analyzed using a loop instead of recursion unsigned int nLoopCount=0; ContentSpecNode* cursor=curNode; while(cursor->getType()==ContentSpecNode::Sequence && cursor->getFirst() && cursor->getFirst()->getSecond()==rightNode) { nLoopCount++; cursor=cursor->getFirst(); } if(nLoopCount!=0) { count += countLeafNodes(cursor); for(unsigned int i=0;igetType(); if ((curType & 0x0f) == ContentSpecNode::Any || (curType & 0x0f) == ContentSpecNode::Any_Other || (curType & 0x0f) == ContentSpecNode::Any_NS) { retNode = new (fMemoryManager) CMAny ( curType , curNode->getElement()->getURI() , curIndex , fLeafCount , fMemoryManager ); fLeafList[curIndex] = new (fMemoryManager) CMLeaf ( new (fMemoryManager) QName ( XMLUni::fgZeroLenString , XMLUni::fgZeroLenString , curNode->getElement()->getURI() , fMemoryManager ) , curIndex , true , fLeafCount , fMemoryManager ); fLeafListType[curIndex] = curType; ++curIndex; } else if (curType == ContentSpecNode::Leaf) { // // Create a new leaf node, and pass it the current leaf count, which // is its DFA state position. Bump the leaf count after storing it. // This makes the positions zero based since we store first and then // increment. // retNode = new (fMemoryManager) CMLeaf ( curNode->getElement() , curIndex , fLeafCount , fMemoryManager ); fLeafList[curIndex] = new (fMemoryManager) CMLeaf ( curNode->getElement() , curIndex , fLeafCount , fMemoryManager ); fLeafListType[curIndex] = ContentSpecNode::Leaf; ++curIndex; } else if (curType == ContentSpecNode::Loop) { // // Create a new leaf node, and pass it the current leaf count, which // is its DFA state position. Bump the leaf count after storing it. // This makes the positions zero based since we store first and then // increment. // retNode = new (fMemoryManager) CMRepeatingLeaf ( curNode->getFirst()->getElement() , curNode->getMinOccurs() , curNode->getMaxOccurs() , curIndex , fLeafCount , fMemoryManager ); fLeafList[curIndex] = new (fMemoryManager) CMRepeatingLeaf ( curNode->getFirst()->getElement() , curNode->getMinOccurs() , curNode->getMaxOccurs() , curIndex , fLeafCount , fMemoryManager ); fLeafListType[curIndex] = curNode->getFirst()->getType(); ++curIndex; } else { // // Its not a leaf, so we have to recurse its left and maybe right // nodes. Save both values before we recurse and trash the node. // ContentSpecNode* leftNode = curNode->getFirst(); ContentSpecNode* rightNode = curNode->getSecond(); // Detect if we have a deep tree that can be analyzed using a loop instead of recursion unsigned int nLoopCount=0; ContentSpecNode* cursor=curNode; while(cursor->getType()==ContentSpecNode::Sequence && cursor->getFirst() && cursor->getFirst()->getSecond()==rightNode) { nLoopCount++; cursor=cursor->getFirst(); } if(nLoopCount!=0) { retNode = buildSyntaxTree(cursor, curIndex); for(unsigned int i=0;igetLastPos(); const CMStateSet& first = newRight->getFirstPos(); // // Now, for every position which is in our left child's last set // add all of the states in our right child's first set to the // follow set for that position. // CMStateSetEnumerator enumLast(&last); while(enumLast.hasMoreElements()) { XMLSize_t index=enumLast.nextElement(); *fFollowList[index] |= first; } retNode = new (fMemoryManager) CMBinaryOp ( ContentSpecNode::Sequence , retNode , newRight , fLeafCount , fMemoryManager ); } return retNode; } if (((curType & 0x0f) == ContentSpecNode::Choice) || ((curType & 0x0f) == ContentSpecNode::Sequence)) { // // Recurse on both children, and return a binary op node with the // two created sub nodes as its children. The node type is the // same type as the source. // CMNode* newLeft = buildSyntaxTree(leftNode, curIndex); CMNode* newRight = buildSyntaxTree(rightNode, curIndex); if(((curType & 0x0f) == ContentSpecNode::Sequence)) { // // Now handle our level. We use our left child's last pos set and our // right child's first pos set, so get them now for convenience. // const CMStateSet& last = newLeft->getLastPos(); const CMStateSet& first = newRight->getFirstPos(); // // Now, for every position which is in our left child's last set // add all of the states in our right child's first set to the // follow set for that position. // CMStateSetEnumerator enumLast(&last); while(enumLast.hasMoreElements()) { XMLSize_t index=enumLast.nextElement(); *fFollowList[index] |= first; } } retNode = new (fMemoryManager) CMBinaryOp ( curType , newLeft , newRight , fLeafCount , fMemoryManager ); } else if (curType == ContentSpecNode::ZeroOrMore || curType == ContentSpecNode::ZeroOrOne || curType == ContentSpecNode::OneOrMore) { CMNode* newChild = buildSyntaxTree(leftNode, curIndex); if (curType == ContentSpecNode::ZeroOrMore || curType == ContentSpecNode::OneOrMore) { // // Now handle our level. We use our own first and last position // sets, so get them up front. // const CMStateSet& first = newChild->getFirstPos(); const CMStateSet& last = newChild->getLastPos(); // // For every position which is in our last position set, add all // of our first position states to the follow set for that // position. // CMStateSetEnumerator enumLast(&last); while(enumLast.hasMoreElements()) { XMLSize_t index=enumLast.nextElement(); *fFollowList[index] |= first; } } // This one is fine as is, just change to our form retNode = new (fMemoryManager) CMUnaryOp ( curType , newChild , fLeafCount , fMemoryManager ); } else { ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::CM_UnknownCMSpecType, fMemoryManager); } } // fault in the first and last pos, then delete it children retNode->getFirstPos(); retNode->getLastPos(); retNode->orphanChild(); return retNode; } // // gInvalidTrans is used to represent bad transitions in the transition table // entry for each state. So each entry is initialized to that value. This // method creates a new entry and initializes it. // unsigned int* DFAContentModel::makeDefStateList() const { unsigned int* retArray = (unsigned int*) fMemoryManager->allocate ( fElemMapSize * sizeof(unsigned int) ); //new unsigned int[fElemMapSize]; for (unsigned int index = 0; index < fElemMapSize; index++) retArray[index] = XMLContentModel::gInvalidTrans; return retArray; } ContentLeafNameTypeVector* DFAContentModel::getContentLeafNameTypeVector() const { //later change it to return the data member return fLeafNameTypeVector; } void DFAContentModel::checkUniqueParticleAttribution (SchemaGrammar* const pGrammar, GrammarResolver* const pGrammarResolver, XMLStringPool* const pStringPool, XMLValidator* const pValidator, unsigned int* const pContentSpecOrgURI, const XMLCh* pComplexTypeName /*= 0*/) { SubstitutionGroupComparator comparator(pGrammarResolver, pStringPool); unsigned int i, j, k; // Rename the URI back for (i = 0; i < fElemMapSize; i++) { unsigned int orgURIIndex = fElemMap[i]->getURI(); if ((orgURIIndex != XMLContentModel::gEOCFakeId) && (orgURIIndex != XMLContentModel::gEpsilonFakeId) && (orgURIIndex != XMLElementDecl::fgInvalidElemId) && (orgURIIndex != XMLElementDecl::fgPCDataElemId)) { fElemMap[i]->setURI(pContentSpecOrgURI[orgURIIndex]); } } // Unique Particle Attribution // Store the conflict results between any two elements in fElemMap // 0 - not yet tested, 1 - conflict, (-1) - no conflict signed char** conflictTable = (signed char**) fMemoryManager->allocate ( fElemMapSize * sizeof(signed char*) ); // initialize the conflict table for (j = 0; j < fElemMapSize; j++) { conflictTable[j] = (signed char*) fMemoryManager->allocate ( fElemMapSize * sizeof(signed char) ); memset(conflictTable[j], 0, fElemMapSize*sizeof(signed char)); } // for each state, check whether it has overlap transitions for (i = 0; i < fTransTableSize; i++) { for (j = 0; j < fElemMapSize; j++) { for (k = j+1; k < fElemMapSize; k++) { if (fTransTable[i][j] != XMLContentModel::gInvalidTrans && fTransTable[i][k] != XMLContentModel::gInvalidTrans && conflictTable[j][k] == 0) { // If this is text in a Schema mixed content model, skip it. if ( fIsMixed && (( fElemMap[j]->getURI() == XMLElementDecl::fgPCDataElemId) || ( fElemMap[k]->getURI() == XMLElementDecl::fgPCDataElemId))) continue; if (XercesElementWildcard::conflict(pGrammar, fElemMapType[j], fElemMap[j], fElemMapType[k], fElemMap[k], &comparator)) { if (fCountingStates != 0) { Occurence* o = fCountingStates[i]; // If "i" is a counting state and exactly one of the transitions // loops back to "i" then the two particles do not overlap if // minOccurs == maxOccurs. if (o != 0 && ((fTransTable[i][j] == i) ^ (fTransTable[i][k] == i)) && o->minOccurs == o->maxOccurs) { conflictTable[j][k] = -1; continue; } } conflictTable[j][k] = 1; XMLBuffer buf1(1023, fMemoryManager); if (((fElemMapType[j] & 0x0f) == ContentSpecNode::Any) || ((fElemMapType[j] & 0x0f) == ContentSpecNode::Any_NS)) buf1.set(SchemaSymbols::fgATTVAL_TWOPOUNDANY); else if ((fElemMapType[j] & 0x0f) == ContentSpecNode::Any_Other) buf1.set(SchemaSymbols::fgATTVAL_TWOPOUNDOTHER); else buf1.set(fElemMap[j]->getRawName()); XMLBuffer buf2(1023, fMemoryManager); if (((fElemMapType[k] & 0x0f) == ContentSpecNode::Any) || ((fElemMapType[k] & 0x0f) == ContentSpecNode::Any_NS)) buf2.set(SchemaSymbols::fgATTVAL_TWOPOUNDANY); else if ((fElemMapType[k] & 0x0f) == ContentSpecNode::Any_Other) buf2.set(SchemaSymbols::fgATTVAL_TWOPOUNDOTHER); else buf2.set(fElemMap[k]->getRawName()); pValidator->emitError(XMLValid::UniqueParticleAttributionFail, pComplexTypeName, buf1.getRawBuffer(), buf2.getRawBuffer()); } else conflictTable[j][k] = -1; } } } } for (i = 0; i < fElemMapSize; i++) fMemoryManager->deallocate(conflictTable[i]); fMemoryManager->deallocate(conflictTable); } XERCES_CPP_NAMESPACE_END