/*
  Copyright (c) 2004-2015 by Jakob Schröter
  This file is part of the gloox library. http://camaya.net/gloox

  This software is distributed under a license. The full license
  agreement can be found in the file LICENSE in this distribution.
  This software may not be copied, modified, sold or distributed
  other than expressed in the named license agreement.

  This software is distributed without any warranty.
*/

#include "gloox.h"
#include "util.h"
#include "parser.h"

#include <cstdlib>

namespace gloox
{

  /*
   * Creates a parser in the Initial state.
   * ph receives every completed top-level tag (see streamEvent()); when
   * deleteRoot is true the root tag is deleted right after it was handed over.
   */
  Parser::Parser( TagHandler* ph, bool deleteRoot )
    : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_xmlnss( 0 ), m_state( Initial ),
      m_preamble( 0 ), m_quote( false ), m_haveTagPrefix( false ), m_haveAttribPrefix( false ),
      m_attribIsXmlns( false ), m_deleteRoot( deleteRoot )
  {
  }

  /*
   * Releases everything the parser still owns: the unfinished tag tree, the
   * pending namespace map and any attributes that were collected but not yet
   * attached to a tag.
   */
  Parser::~Parser()
  {
    delete m_root;
    delete m_xmlnss;
    util::clearList( m_attribs );
  }

  /*
   * Decodes the entity or character reference that starts at data[pos] (the '&').
   *
   * On success the replacement text is appended to m_cdata or m_value, depending
   * on the current state, and pos is advanced to the terminating ';'.
   *
   * Returns DecodeValid on success, DecodeInvalid for a malformed reference and
   * DecodeInsufficient if the terminating ';' has not arrived yet; in that case
   * the unparsed remainder is stored in m_backBuffer.
   */
  Parser::DecodeState Parser::decode( std::string::size_type& pos, const std::string& data )
  {
    const std::string::size_type p = data.find( ';', pos );
    if( p == std::string::npos )
    {
      // The longest accepted reference spans 10 octets ("&#x10FFFF;"). If that many
      // octets are available without a ';', the reference can never become valid,
      // so fail now instead of buffering input without bound.
      if( data.length() - pos > 9 )
        return DecodeInvalid;

      m_backBuffer = data.substr( pos );
      return DecodeInsufficient;
    }

    const std::string::size_type diff = p - pos;
    if( diff < 3 || diff > 9 )
      return DecodeInvalid;

    std::string rep;
    switch( data[pos + 1] )
    {
      case '#':
      {
        // numeric character reference, decimal or (with an 'x' marker) hexadecimal
        int base = 10;
        int idx = 2;

        if( data[pos + 2] == 'x' || data[pos + 2] == 'X' )
        {
          base = 16;
          idx = 3;
        }

        char* end;
        const long int val = std::strtol( data.data() + pos + idx, &end, base );
        if( *end != ';' || val < 0 )
          return DecodeInvalid;

        // encode the code point as UTF-8; only characters allowed by XML are accepted
        if( val == 0x9 || val == 0xA || val == 0xD || ( val >= 0x20 && val <= 0x7F ) )
        {
          rep += char( val );
        }
        else if( val >= 0x80 && val <= 0x7FF )
        {
          rep += char( 192 + ( val >> 6 ) );
          rep += char( 128 + ( val % 64 ) );
        }
        else if( ( val >= 0x800 && val <= 0xD7FF ) || ( val >= 0xE000 && val <= 0xFFFD ) )
        {
          rep += char( 224 + ( val >> 12 ) );
          rep += char( 128 + ( ( val >> 6 ) % 64 ) );
          rep += char( 128 + ( val % 64 ) );
        }
        else if( val >= 0x10000 && val <= 0x10FFFF )
        {
          // supplementary planes: four-octet sequence
          rep += char( 240 + ( val >> 18 ) );
          rep += char( 128 + ( ( val >> 12 ) % 64 ) );
          rep += char( 128 + ( ( val >> 6 ) % 64 ) );
          rep += char( 128 + ( val % 64 ) );
        }
        else
          return DecodeInvalid;
        break;
      }
      case 'l':
        if( diff == 3 && data[pos + 2] == 't' )
          rep += '<';
        else
          return DecodeInvalid;
        break;
      case 'g':
        if( diff == 3 && data[pos + 2] == 't' )
          rep += '>';
        else
          return DecodeInvalid;
        break;
      case 'a':
        if( diff == 5 && !data.compare( pos + 1, 5, "apos;" ) )
          rep += '\'';
        else if( diff == 4 && !data.compare( pos + 1, 4, "amp;" ) )
          rep += '&';
        else
          return DecodeInvalid;
        break;
      case 'q':
        if( diff == 5 && !data.compare( pos + 1, 5, "quot;" ) )
          rep += '"';
        else
          return DecodeInvalid;
        break;
      default:
        return DecodeInvalid;
    }

    // store the decoded text where the current state collects its character data
    switch( m_state )
    {
      case InterTag:
      case TagInside:
        m_cdata += rep;
        break;
      case TagAttributeValue:
        m_value += rep;
        break;
      default:
        break;
    }

    pos += diff;
    return DecodeValid;
  }

  /*
   * Checks whether needle occurs in data exactly at pos.
   *
   * Returns ForwardFound and moves pos to the last octet of the match,
   * ForwardNotFound if data holds something else at pos, or
   * ForwardInsufficientSize if data is too short to decide; in the latter case
   * the remainder of data is stored in m_backBuffer.
   */
  Parser::ForwardScanState Parser::forwardScan( std::string::size_type& pos, const std::string& data,
                                                const std::string& needle )
  {
    if( pos + needle.length() > data.length() )
    {
      m_backBuffer = data.substr( pos );
      return ForwardInsufficientSize;
    }

    if( data.compare( pos, needle.length(), needle ) )
      return ForwardNotFound;

    pos += needle.length() - 1;
    return ForwardFound;
  }

  /*
   * Feeds a chunk of XML into the state machine.
   *
   * Input left over from a previous call (an incomplete entity reference or
   * CDATA marker) is prepended to data first, so data is modified.
   *
   * Returns -1 if the chunk was consumed without error (more input may be
   * needed), otherwise the index into data of the offending octet. On error the
   * parser state is reset via cleanup().
   */
  int Parser::feed( std::string& data )
  {
    if( !m_backBuffer.empty() )
    {
      data.insert( 0, m_backBuffer );
      m_backBuffer = EmptyString;
    }

    std::string::size_type count = data.length();
    for( std::string::size_type i = 0; i < count; ++i )
    {
      const unsigned char c = data[i];

      switch( m_state )
      {
        case Initial: // nothing seen yet, only whitespace or '<' allowed
          if( isWhitespace( c ) )
            break;

          switch( c )
          {
            case '<':
              m_state = TagOpening;
              break;
            default:
              cleanup();
              return static_cast<int>( i );
          }
          break;

        case InterTag: // a tag has just been closed
          m_tag = EmptyString;
          if( isWhitespace( c ) )
          {
            m_state = TagInside;
            if( m_current )
              m_cdata += c;
            break;
          }

          switch( c )
          {
            case '&':
              switch( decode( i, data ) )
              {
                case DecodeValid:
                  m_state = TagInside;
                  break;
                case DecodeInvalid:
                  cleanup();
                  return static_cast<int>( i );
                case DecodeInsufficient:
                  return -1;
              }
              break;
            case '<':
              m_state = TagOpening;
              break;
            case '>':
            default:
              if( m_current )
              {
                m_cdata += c;
                m_state = TagInside;
              }
              break;
          }
          break;

        case TagOpening: // opening '<' has been found before
          if( isWhitespace( c ) )
            break;

          switch( c )
          {
            case '<':
            case '>':
            case '&':
              cleanup();
              return static_cast<int>( i );
            case '/':
              m_state = TagClosingSlash;
              break;
            case '?':
              m_state = TagNameCollect;
              m_preamble = 1;
              break;
            case '!':
              // the only '<!' construct accepted is a CDATA section
              switch( forwardScan( i, data, "![CDATA[" ) )
              {
                case ForwardFound:
                  m_state = TagCDATASection;
                  break;
                case ForwardNotFound:
                  cleanup();
                  return static_cast<int>( i );
                case ForwardInsufficientSize:
                  return -1;
              }
              break;
            default:
              m_tag += c;
              m_state = TagNameCollect;
              break;
          }
          break;

        case TagCDATASection: // inside <![CDATA[ ... ]]>, everything is literal
          switch( c )
          {
            case ']':
              switch( forwardScan( i, data, "]]>" ) )
              {
                case ForwardFound:
                  m_state = TagInside;
                  break;
                case ForwardNotFound:
                  m_cdata += c;
                  break;
                case ForwardInsufficientSize:
                  return -1;
              }
              break;
            default:
              m_cdata += c;
              break;
          }
          break;

        case TagNameCollect: // we're collecting the tag's name, we have at least one octet already
          if( isWhitespace( c ) )
          {
            m_state = TagNameComplete;
            break;
          }

          switch( c )
          {
            case '<':
            case '?':
            case '!':
            case '&':
              cleanup();
              return static_cast<int>( i );
            case '/':
              m_state = TagOpeningSlash;
              break;
            case '>':
              addTag();
              m_state = TagInside;
              break;
            case ':':
              // what has been collected so far is the prefix; only one ':' is allowed
              if( !m_haveTagPrefix )
              {
                m_haveTagPrefix = true;
                m_tagPrefix = m_tag;
                m_tag = EmptyString;
              }
              else
              {
                cleanup();
                return static_cast<int>( i );
              }
              break;
            default:
              m_tag += c;
              break;
          }
          break;

        case TagInside: // we're inside a tag, expecting a child tag or cdata
          m_tag = EmptyString;
          switch( c )
          {
            case '<':
              addCData();
              m_state = TagOpening;
              break;
            case '&':
              switch( decode( i, data ) )
              {
                case DecodeValid:
                  break;
                case DecodeInvalid:
                  cleanup();
                  return static_cast<int>( i );
                case DecodeInsufficient:
                  return -1;
              }
              break;
            default:
              m_cdata += c;
              break;
          }
          break;

        case TagOpeningSlash: // a slash in an opening tag has been found, initing close of the tag
          if( isWhitespace( c ) )
            break;

          if( c == '>' )
          {
            addTag();
            if( !closeTag() )
            {
              cleanup();
              return static_cast<int>( i );
            }
            m_state = InterTag;
          }
          else
          {
            cleanup();
            return static_cast<int>( i );
          }
          break;

        case TagClosingSlash: // we have found the '/' of a closing tag
          if( isWhitespace( c ) )
            break;

          switch( c )
          {
            case '>':
            case '<':
            case '/':
              cleanup();
              return static_cast<int>( i );
            default:
              m_tag += c;
              m_state = TagClosing;
              break;
          }
          break;

        case TagClosing: // we're collecting the name of a closing tag
          switch( c )
          {
            case '<':
            case '/':
            case '!':
            case '?':
            case '&':
              cleanup();
              return static_cast<int>( i );
            case ':':
              if( !m_haveTagPrefix )
              {
                m_haveTagPrefix = true;
                m_tagPrefix = m_tag;
                m_tag = EmptyString;
              }
              else
              {
                cleanup();
                return static_cast<int>( i );
              }
              break;
            case '>':
              if( !closeTag() )
              {
                cleanup();
                return static_cast<int>( i );
              }
              m_state = InterTag;
              break;
            default:
              m_tag += c;
              break;
          }
          break;

        case TagNameComplete: // a tag name is complete, expect tag close or attribs
          if( isWhitespace( c ) )
            break;

          switch( c )
          {
            case '<':
            case '!':
            case '&':
              cleanup();
              return static_cast<int>( i );
            case '/':
              m_state = TagOpeningSlash;
              break;
            case '>':
              // a processing instruction must be terminated by "?>"
              if( m_preamble == 1 )
              {
                cleanup();
                return static_cast<int>( i );
              }
              // set the state first: addTag() may reset it through cleanup()
              m_state = TagInside;
              addTag();
              break;
            case '?':
              if( m_preamble == 1 )
                m_preamble = 2;
              else
              {
                cleanup();
                return static_cast<int>( i );
              }
              break;
            default:
              m_attrib += c;
              m_state = TagAttribute;
              break;
          }
          break;

        case TagAttribute: // we're collecting the name of an attribute, we have at least 1 octet
          if( isWhitespace( c ) )
          {
            m_state = TagAttributeComplete;
            break;
          }

          switch( c )
          {
            case '<':
            case '/':
            case '>':
            case '?':
            case '!':
            case '&':
              cleanup();
              return static_cast<int>( i );
            case '=':
              m_state = TagAttributeEqual;
              break;
            case ':':
              if( !m_haveAttribPrefix && m_attrib != XMLNS )
              {
                // ordinary prefixed attribute
                m_haveAttribPrefix = true;
                m_attribPrefix = m_attrib;
                m_attrib = EmptyString;
              }
              else if( m_attrib == XMLNS )
              {
                // namespace declaration: what follows is the declared prefix
                m_attribIsXmlns = true;
                m_attrib = EmptyString;
              }
              else
              {
                cleanup();
                return static_cast<int>( i );
              }
              break;
            default:
              m_attrib += c;
          }
          break;

        case TagAttributeComplete: // we're expecting an equals sign or ws
          if( isWhitespace( c ) )
            break;

          switch( c )
          {
            case '=':
              m_state = TagAttributeEqual;
              break;
            default:
              cleanup();
              return static_cast<int>( i );
          }
          break;

        case TagAttributeEqual: // we have found an equals sign
          if( isWhitespace( c ) )
            break;

          switch( c )
          {
            case '"':
              m_quote = true; // value is delimited by double quotes
              m_state = TagAttributeValue;
              break;
            case '\'':
              m_quote = false; // value is delimited by single quotes
              m_state = TagAttributeValue;
              break;
            default:
              cleanup();
              return static_cast<int>( i );
          }
          break;

        case TagAttributeValue: // we're expecting value data
          switch( c )
          {
            case '<':
              cleanup();
              return static_cast<int>( i );
            case '\'':
            case '"':
              // only the quote character that opened the value terminates it,
              // the other one is ordinary value data
              if( ( c == '"' ) == m_quote )
              {
                addAttribute();
                m_state = TagNameAlmostComplete;
                m_quote = false;
              }
              else
                m_value += c;
              break;
            case '&':
              switch( decode( i, data ) )
              {
                case DecodeValid:
                  break;
                case DecodeInvalid:
                  cleanup();
                  return static_cast<int>( i );
                case DecodeInsufficient:
                  return -1;
              }
              break;
            case '>':
            default:
              m_value += c;
          }
          break;

        case TagNameAlmostComplete: // an attribute value has just been closed
          if( isWhitespace( c ) )
          {
            m_state = TagNameComplete;
            break;
          }

          switch( c )
          {
            case '/':
              m_state = TagOpeningSlash;
              break;
            case '>':
              if( m_preamble == 1 )
              {
                cleanup();
                return static_cast<int>( i );
              }
              // set the state first: addTag() may reset it through cleanup()
              m_state = TagInside;
              addTag();
              break;
            case '?':
              if( m_preamble == 1 )
                m_preamble = 2;
              else
              {
                cleanup();
                return static_cast<int>( i );
              }
              break;
            default:
              cleanup();
              return static_cast<int>( i );
          }
          break;

        default:
          break;
      }
    }

    return -1;
  }

  /*
   * Creates a Tag from the collected name, prefix, attributes and namespace
   * declarations and makes it the current tag; the first tag becomes the root.
   * A stream opening tag is forwarded to the handler right away, and a completed
   * XML declaration is discarded.
   */
  void Parser::addTag()
  {
    if( !m_root )
    {
      m_root = new Tag( m_tag );
      m_current = m_root;
    }
    else
    {
      m_current = new Tag( m_current, m_tag );
    }

    if( m_haveTagPrefix )
    {
      m_current->setPrefix( m_tagPrefix );
      m_haveTagPrefix = false;
    }

    if( m_attribs.size() )
    {
      // the attribute pointers are handed to the tag, hence only the list is emptied
      m_current->setAttributes( m_attribs );
      m_attribs.clear();
    }

    if( m_xmlnss )
    {
      // the map is handed to the tag as well
      m_current->setXmlns( m_xmlnss );
      m_xmlnss = 0;
    }

    m_current->setXmlns( m_xmlns );
    m_xmlns = EmptyString;

    if( m_tag == "stream" && m_root->xmlns() == XMLNS_STREAM )
    {
      streamEvent( m_root );
      cleanup( m_deleteRoot );
      return;
    }

    if( m_root && m_root == m_current && m_tagPrefix == "stream" )
      m_root->setXmlns( XMLNS_STREAM, m_tagPrefix );

    if( m_tag == "xml" && m_preamble == 2 )
      cleanup();
  }

  /*
   * Turns the collected attribute name/value pair into a Tag::Attribute, queues
   * it for the tag under construction and records namespace declarations.
   * Resets the per-attribute state afterwards.
   */
  void Parser::addAttribute()
  {
    Tag::Attribute* attr = new Tag::Attribute( m_attrib, m_value );
    if( m_attribIsXmlns )
    {
      // xmlns:prefix="..." declaration
      if( !m_xmlnss )
        m_xmlnss = new StringMap();

      (*m_xmlnss)[m_attrib] = m_value;
      attr->setPrefix( XMLNS );
    }
    else
    {
      if( !m_attribPrefix.empty() )
        attr->setPrefix( m_attribPrefix );
      if( m_attrib == XMLNS )
        m_xmlns = m_value;
    }
    m_attribs.push_back( attr );
    m_attrib = EmptyString;
    m_value = EmptyString;
    m_attribPrefix = EmptyString;
    m_haveAttribPrefix = false;
    m_attribIsXmlns = false;
  }

  /*
   * Appends the collected character data to the current tag, if there is a
   * current tag and any data at all.
   */
  void Parser::addCData()
  {
    if( m_current && !m_cdata.empty() )
    {
      m_current->addCData( m_cdata );
      m_cdata = EmptyString;
    }
  }

  /*
   * Handles a closing tag named m_tag (with prefix m_tagPrefix).
   *
   * Returns false if it does not match the current tag. Closing the root tag
   * hands the finished tree to the handler and resets the parser. A closing
   * stream:stream tag is always accepted.
   */
  bool Parser::closeTag()
  {
    if( m_tag == "stream" && m_tagPrefix == "stream" )
      return true;

    if( !m_current || m_current->name() != m_tag
         || ( !m_current->prefix().empty() && m_current->prefix() != m_tagPrefix ) )
    {
      return false;
    }

    m_tagPrefix = EmptyString;
    m_haveTagPrefix = false;

    if( m_current->parent() )
      m_current = m_current->parent();
    else
    {
      streamEvent( m_root );
      cleanup( m_deleteRoot );
    }

    return true;
  }

  /*
   * Resets the parser to the Initial state and drops all partially collected
   * data. The root tag is deleted only if deleteRoot is true; either way the
   * parser forgets about it.
   */
  void Parser::cleanup( bool deleteRoot )
  {
    if( deleteRoot )
      delete m_root;
    m_root = 0;
    m_current = 0;
    delete m_xmlnss;
    m_xmlnss = 0;
    m_cdata = EmptyString;
    m_tag = EmptyString;
    m_attrib = EmptyString;
    m_attribPrefix = EmptyString;
    m_tagPrefix = EmptyString;
    m_haveAttribPrefix = false;
    m_haveTagPrefix = false;
    // an error may hit in the middle of an attribute; do not carry the quote
    // style or the namespace-declaration marker over into the next document
    m_attribIsXmlns = false;
    m_quote = false;
    m_value = EmptyString;
    m_xmlns = EmptyString;
    util::clearList( m_attribs );
    m_attribs.clear();
    m_state = Initial;
    m_preamble = 0;
  }

  /*
   * Returns true if c is a tab, line feed, carriage return or space.
   */
  bool Parser::isWhitespace( unsigned char c )
  {
    return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 );
  }

  /*
   * Passes tag on to the registered TagHandler, if there is one.
   */
  void Parser::streamEvent( Tag* tag )
  {
    if( m_tagHandler )
      m_tagHandler->handleTag( tag );
  }

}