From 39d5842e7cbe638b5dd028aefc32de04f3bfe195 Mon Sep 17 00:00:00 2001 From: Jaidyn Ann Date: Sat, 4 Jul 2020 04:44:54 -0500 Subject: [PATCH] Start replacing raptor with tinyxml (finally!) --- Makefile | 2 +- src/Channel.cpp | 37 ++++- src/Channel.h | 12 +- src/Config.h | 1 + src/Item.cpp | 66 +++++++-- src/Item.h | 19 ++- src/Rifen.cpp | 16 +-- src/Rifen.h | 7 + src/parsing.cpp | 364 +++++++++++------------------------------------- src/parsing.h | 29 ++-- 10 files changed, 213 insertions(+), 340 deletions(-) diff --git a/Makefile b/Makefile index 7d60be2..4873ff0 100644 --- a/Makefile +++ b/Makefile @@ -68,7 +68,7 @@ RSRCS = \ # - if your library does not follow the standard library naming scheme, # you need to specify the path to the library and it's name. # (e.g. for mylib.a, specify "mylib.a" or "path/mylib.a") -LIBS = be tracker shared raptor2 bnetapi network $(STDCPPLIBS) +LIBS = be tracker shared tinyxml2 bnetapi network $(STDCPPLIBS) # Specify additional paths to directories following the standard libXXX.so # or libXXX.a naming scheme. 
You can specify full paths or paths relative diff --git a/src/Channel.cpp b/src/Channel.cpp index acacbe2..2a26eb7 100644 --- a/src/Channel.cpp +++ b/src/Channel.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include "Channel.h" #include "Item.h" #include "Config.h" @@ -12,6 +12,7 @@ Channel::Channel ( BString path, BString outputPath ) homePage = BString(""); xmlUrl = BString(""); filePath = path; + lastDate = BString(""); topLevelSubject = ""; lastSubject = ""; outputDir = outputPath; @@ -20,8 +21,36 @@ Channel::Channel ( BString path, BString outputPath ) void Channel::Parse ( Config* cfg ) { - int itemCount = countItemParser( filePath.String() ); - items = BList(itemCount); + items = BList(); Channel* chan = this; - feedParser(&chan); + feedParser(&chan, cfg); +} + +void Channel::SetTitle ( const char* titleStr ) { + if ( titleStr != NULL ) title = BString( titleStr ); +} +void Channel::SetTitle ( tinyxml2::XMLElement* elem ) { + if ( elem != NULL ) SetTitle( elem->GetText() ); +} + +void Channel::SetDesc ( const char* descStr ) { + if ( descStr != NULL ) description = BString( descStr ); +} +void Channel::SetDesc ( tinyxml2::XMLElement* elem ) { + if ( elem != NULL ) SetDesc( elem->GetText() ); +} + +void Channel::SetHomePage ( const char* homepageStr ) { + if ( homepageStr != NULL ) + homePage = BString( homepageStr ); +} +void Channel::SetHomePage ( tinyxml2::XMLElement* elem ) { + if ( elem != NULL ) SetHomePage( elem->GetText() ); +} + +void Channel::SetLastDate ( const char* dateStr ) { + if ( dateStr != NULL ) lastDate = BString( dateStr ); +} +void Channel::SetLastDate ( tinyxml2::XMLElement* elem ) { + if ( elem != NULL ) SetLastDate( elem->GetText() ); } diff --git a/src/Channel.h b/src/Channel.h index 3f214de..3a2d64a 100644 --- a/src/Channel.h +++ b/src/Channel.h @@ -1,6 +1,7 @@ #ifndef CHANNEL_H #define CHANNEL_H +#include #include #include #include @@ -12,7 +13,7 @@ public: char lang[3]; BString title; BString description; - BDate 
lastBuildDate; + BString lastDate; BString homePage; BString xmlUrl; BList items; @@ -27,6 +28,15 @@ public: // Channel ( BEntry ); // Channel ( BUrl ); void Parse ( Config* ); + + void SetTitle ( const char* ); + void SetTitle ( tinyxml2::XMLElement* ); + void SetDesc ( const char* ); + void SetDesc ( tinyxml2::XMLElement* ); + void SetLastDate ( const char* ); + void SetLastDate ( tinyxml2::XMLElement* ); + void SetHomePage ( const char* ); + void SetHomePage ( tinyxml2::XMLElement* ); }; #endif diff --git a/src/Config.h b/src/Config.h index c0fd5b5..5adcfd5 100644 --- a/src/Config.h +++ b/src/Config.h @@ -1,6 +1,7 @@ #ifndef CONFIG_H #define CONFIG_H +#include #include class Config { diff --git a/src/Item.cpp b/src/Item.cpp index a9ee188..66f861d 100644 --- a/src/Item.cpp +++ b/src/Item.cpp @@ -1,30 +1,30 @@ #include #include -#include +#include #include +#include "Config.h" #include "Item.h" -Item::Item ( BString localSubject, BString outputPath ) +Item::Item ( BString outputPath ) { - subject = localSubject; title = BString(""); description = BString(""); homePage = BString(""); postUrl = BString(""); - content = ""; + content = BString(""); pubDate = BString(""); outputDir = outputPath; } bool -Item::Filetize ( bool onlyIfNew = false ) +Item::Filetize ( Config* cfg, bool onlyIfNew = false ) { BDirectory* dir = new BDirectory( outputDir ); BFile* file = new BFile( title.String(), B_READ_WRITE ); dir->CreateFile( title.String(), file ); - BString betype = "text/html"; + BString betype = cfg->mimetype; file->WriteAttr( "META:title", B_STRING_TYPE, 0, title.String(), title.CountChars() ); @@ -37,15 +37,53 @@ Item::Filetize ( bool onlyIfNew = false ) file->WriteAttr( "BEOS:TYPE", B_STRING_TYPE, 0, betype.String(), betype.CountChars() ); + file->Write(content.String(), content.Length()); // using file->Write with content converted to C string messes up length ofc // this is required to preserve length (because of UTF char substitutions in parsing.cpp) - const char* 
strPath = outputDir.String(); - std::string path(strPath); - path += std::string(title.String()); - std::cout << path << std::endl; - - std::ofstream pFile(path); - pFile << content; - pFile.close(); +// const char* strPath = outputDir.String(); +// std::string path(strPath); +// path += std::string(title.String()); +// std::cout << path << std::endl; +// +// std::ofstream pFile(path); +// pFile << content; +// pFile.close(); return false; } + +void Item::SetTitle ( const char* titleStr ) { + if ( titleStr != NULL ) title = BString( titleStr ); +} +void Item::SetTitle ( tinyxml2::XMLElement* elem ) { + if ( elem != NULL ) SetTitle( elem->GetText() ); +} + +void Item::SetDesc ( const char* descStr ) { + if ( descStr != NULL ) description = BString( descStr ); +} +void Item::SetDesc ( tinyxml2::XMLElement* elem ) { + if ( elem != NULL ) SetDesc( elem->GetText() ); +} + +void Item::SetContent ( const char* contentStr ) { + if ( contentStr != NULL ) content = BString( contentStr ); +} +void Item::SetContent ( tinyxml2::XMLElement* elem ) { + if ( elem != NULL ) SetContent( elem->GetText() ); +} + +void Item::SetPostUrl ( const char* urlStr ) { + if ( urlStr != NULL ) + postUrl = BString( urlStr ); +} +void Item::SetPostUrl ( tinyxml2::XMLElement* elem ) { + if ( elem != NULL ) SetPostUrl( elem->GetText() ); +} + +void Item::SetPubDate ( const char* dateStr ) { + if ( dateStr != NULL ) + pubDate = BString( dateStr ); +} +void Item::SetPubDate ( tinyxml2::XMLElement* elem ) { + if ( elem != NULL ) SetPubDate( elem->GetText() ); +} diff --git a/src/Item.h b/src/Item.h index 6f76a07..e7d7904 100644 --- a/src/Item.h +++ b/src/Item.h @@ -14,14 +14,25 @@ public: BString pubDate; BString homePage; BString postUrl; - std::string content; + BString content; BString outputDir; - BString subject; + Item ( BString ); - Item ( BString, BString ); + bool Filetize ( Config*, bool ); - bool Filetize ( bool ); + void SetTitle ( const char* ); + void SetTitle ( tinyxml2::XMLElement* ); + 
void SetDesc ( const char* ); + void SetDesc ( tinyxml2::XMLElement* ); + void SetContent ( const char* ); + void SetContent ( tinyxml2::XMLElement* ); + void SetPostUrl ( const char* ); + void SetPostUrl ( tinyxml2::XMLElement* ); + void SetPubDate ( const char* ); + void SetPubDate ( tinyxml2::XMLElement* ); }; + + #endif diff --git a/src/Rifen.cpp b/src/Rifen.cpp index beedb2b..4f6a0ad 100644 --- a/src/Rifen.cpp +++ b/src/Rifen.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -8,6 +7,8 @@ #include "Config.h" #include "Rifen.h" +Config* main_cfg; + int usage () { @@ -18,9 +19,8 @@ usage () bool create_item ( void* item ) { - printf("hi"); Item* itemPtr = (Item*)item; - itemPtr->Filetize( false ); + itemPtr->Filetize( main_cfg, false ); return false; } @@ -78,16 +78,14 @@ invocation ( int argc, char** argv, Config** cfgPtr ) int main ( int argc, char** argv ) { -// -// - Config* cfg = new Config; + main_cfg = new Config; usageMsg.ReplaceAll("%app%", "Rifen"); - invocation( argc, argv, &cfg ); + invocation( argc, argv, &main_cfg ); Channel* chan = (Channel*)malloc( sizeof(Channel) ); - chan = new Channel(cfg->targetFeed, cfg->outDir); - chan->Parse(cfg); + chan = new Channel(main_cfg->targetFeed, main_cfg->outDir); + chan->Parse(main_cfg); BList items = chan->items; items.DoForEach(&create_item); diff --git a/src/Rifen.h b/src/Rifen.h index 8793f2f..4dbb1f1 100644 --- a/src/Rifen.h +++ b/src/Rifen.h @@ -37,5 +37,12 @@ BString usageMsg = "Both -t and -T use the ISO 8601 format for specifying datetimes:\n" " YYYY-MM-DDTHH:MM:SS - 2020-01-01T07:07:07\n" "You can leave out seconds, minutes, or hours, but YMD are required.\n" + "\n" + "NOTE: This message doesn't reflect reality. This is more of a spec of\n" + " what I hope this program will be. As of now -t and -T aren't\n" + " implemented, and running %app% without a file/url free-argument\n" + " is invalid, as the daemon isn't implemented at all. 
As such,\n" + " -D is also non-functional.\n" + " But it sure can turn an XML feed into files! Lol.\n" ; diff --git a/src/parsing.cpp b/src/parsing.cpp index 93cb90d..db5ab93 100644 --- a/src/parsing.cpp +++ b/src/parsing.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include "Channel.h" #include "Item.h" #include "parsing.h" @@ -9,300 +9,92 @@ // ============================================================================ // PARSERS void -feedParser ( Channel** chanPtr ) +feedParser ( Channel** chanPtr, Config* cfg ) { Channel* chan = *(chanPtr); - raptor_parser* rss_parser = NULL; - raptor_world* world; - world = raptor_new_world(); - unsigned char *uri_string; - raptor_uri *uri, *base_uri; + tinyxml2::XMLDocument xml; + xml.LoadFile( chan->filePath.String() ); - rss_parser = raptor_new_parser( world, "rss-tag-soup" ); - uri_string = raptor_uri_filename_to_uri_string( chan->filePath.String() ); - uri = raptor_new_uri( world, uri_string ); - base_uri = raptor_uri_copy( uri ); - - raptor_parser_set_statement_handler( rss_parser, &chan, feedHandler ); - raptor_parser_parse_file( rss_parser, uri, base_uri ); - - raptor_free_parser( rss_parser ); - raptor_free_uri( base_uri ); - raptor_free_uri( uri ); - raptor_free_memory( uri_string ); - raptor_free_world( world ); + if ( xml.FirstChildElement("rss") ) + rssParser( chanPtr, cfg, &xml ); + else if ( xml.FirstChildElement("feed") ) + printf("has atom\n"); +} + +void +rssParser ( Channel** chanPtr, Config* cfg, tinyxml2::XMLDocument* xml ) +{ + Channel* chan = *(chanPtr); + + tinyxml2::XMLElement* xchan = xml->FirstChildElement("rss")->FirstChildElement("channel"); + + rssRootParse( chanPtr, cfg, xchan ); + rssParseItems( chanPtr, cfg, xchan ); +} + +void +rssRootParse( Channel** chanPtr, Config* cfg, tinyxml2::XMLElement* xchan ) +{ + Channel* chan = *(chanPtr); + + chan->SetTitle( xchan->FirstChildElement("title") ); + chan->SetDesc( xchan->FirstChildElement("description") ); + chan->SetHomePage( 
xchan->FirstChildElement("link") ); + chan->SetLastDate( xchan->FirstChildElement("lastBuildDate") ); + + if ( cfg->verbose ) + printf("Channel '%s' at '%s':\n", chan->title.String(), chan->homePage.String()); +} + +void +rssItemParse ( Channel** chanPtr, Config* cfg, tinyxml2::XMLElement* xitem ) +{ + Channel* chan = *(chanPtr); + + Item* newItem = (Item*)malloc( sizeof(Item) ); + newItem = new Item( chan->outputDir ); + + newItem->SetTitle( xitem->FirstChildElement("title") ); + newItem->SetDesc( xitem->FirstChildElement("description") ); + newItem->SetPubDate( xitem->FirstChildElement("pubDate") ); + newItem->SetContent( xitem->FirstChildElement("content:encoded") ); + + if (cfg->verbose ) + printf("\t%s\n", newItem->title.String()); + + chan->items.AddItem( newItem ); +} + +void +rssParseItems ( Channel** chanPtr, Config* cfg, tinyxml2::XMLElement* xchan ) +{ + Channel* chan = *(chanPtr); + tinyxml2::XMLElement* xitem; + + xitem = xchan->FirstChildElement("item"); + + int itemCount = xmlCountSiblings( xitem, "item" ); + chan->items = BList(itemCount); + + if ( cfg->verbose ) + printf("\t-%i items-\n", itemCount); + + while ( xitem ) { + rssItemParse( chanPtr, cfg, xitem ); + xitem = xitem->NextSiblingElement("item"); + } } -// ------------------------------------- int -countItemParser ( const char* filePath ) +xmlCountSiblings ( tinyxml2::XMLElement* xsibling, const char* sibling_name ) { - raptor_parser* rss_parser = NULL; - raptor_world* world; - world = raptor_new_world(); + int count = 0; - unsigned char *uri_string; - raptor_uri *uri, *base_uri; - - rss_parser = raptor_new_parser(world, "rss-tag-soup"); - uri_string = raptor_uri_filename_to_uri_string( filePath ); - uri = raptor_new_uri( world, uri_string ); - base_uri = raptor_uri_copy( uri ); - - int* itemCount = (int*)malloc( sizeof(int) ); - *itemCount = 0; - raptor_parser_set_statement_handler( rss_parser, &itemCount, countItemHandler ); - raptor_parser_parse_file( rss_parser, uri, base_uri ); - - 
free( itemCount ); - raptor_free_parser( rss_parser ); - raptor_free_uri( base_uri ); - raptor_free_uri( uri ); - raptor_free_memory( uri_string ); - raptor_free_world( world ); - - return *(itemCount); -} - -void -printStatementParser ( const char* filePath ) -{ - raptor_parser* rss_parser = NULL; - raptor_world* world; - world = raptor_new_world(); - - unsigned char *uri_string; - raptor_uri *uri, *base_uri; - - rss_parser = raptor_new_parser(world, "rss-tag-soup"); - uri_string = raptor_uri_filename_to_uri_string( filePath ); - uri = raptor_new_uri( world, uri_string ); - base_uri = raptor_uri_copy( uri ); - - raptor_parser_set_statement_handler( rss_parser, NULL, printStatementHandler ); - raptor_parser_parse_file( rss_parser, uri, base_uri ); - - raptor_free_parser( rss_parser ); - raptor_free_uri( base_uri ); - raptor_free_uri( uri ); - raptor_free_memory( uri_string ); - raptor_free_world( world ); -} - - -// ============================================================================ -// HANDLERS -void -feedHandler ( void* user_data, raptor_statement* statement ) -{ - if ( user_data != NULL ) { - Channel** chanPtr = (Channel**)user_data; - handleFeedStatement( chanPtr, statement ); - } -} - -void -countItemHandler ( void* user_data, raptor_statement* statement ) -{ - int** countPtr = ( int** )user_data; - int* count = *(countPtr); - - const char* object = ( const char* )raptor_term_to_string( statement->object ); - const char* predicate = ( const char* )raptor_term_to_string( statement->predicate ); - - if (getPredicateTag(predicate) == "type" - && getPredicateTag(object) == "item") - *count += 1; -} - -void -printStatementHandler ( void* user_data, raptor_statement* statement ) -{ - int** countPtr = (int**)user_data; - int* count = *(countPtr); - - const char* subject = ( const char* )raptor_term_to_string( statement->subject ); - const char* predicate = ( const char* )raptor_term_to_string( statement->predicate ); - const char* object = ( const char* 
)raptor_term_to_string( statement->object ); - - printf("%s\t-%s\n%.50s\n", subject, predicate, object); -} - -// ---------------------------------------------------------------------------- -// FEEDHANDLER HELPERS -void -handleFeedStatement ( Channel** chanPtr, raptor_statement* statement ) -{ - Channel* chan = *(chanPtr); - const char* cpredicate = (const char*)raptor_term_to_string( statement->predicate ); - const char* csubject = (const char*)raptor_term_to_string( statement->subject ); - const char* cobject = (const char*)raptor_term_to_string( statement->object ); - - BString predicate = BString(cpredicate); - BString subject = BString(csubject); - BString bobject = BString(cobject); - - bobject.ReplaceAll("\\\"","\""); - bobject.ReplaceFirst("\"",""); - bobject.ReplaceLast("\"",""); - - std::string object = unescape(bobject.String()); - - predicate = getPredicateTag( predicate ); - - if ( predicate == "type" && getPredicateTag( object ) == "channel" ) - chan->topLevelSubject = subject; - - if ( subject != chan->topLevelSubject ) -// handleChannelStatement( chanPtr, predicate, object ); -// else - handleItemStatement( chanPtr, subject, predicate, object ); -} - -void -handleChannelStatement ( Channel** chanPtr, BString predicate, BString object ) -{ - Channel* chan = *(chanPtr); -} - -void -handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, std::string object ) -{ - Channel* chan = *(chanPtr); - if ( subject.StartsWith("_:genid") ) - return; - - chan->title = BString("dad"); - - if ( subject != chan->lastSubject ) { - chan->lastSubject = subject; - - Item* newItem = (Item*)malloc( sizeof(Item) ); - newItem = new Item( subject, chan->outputDir ); - - chan->items.AddItem( newItem ); + while ( xsibling ) { + count++; + xsibling = xsibling->NextSiblingElement(sibling_name); } - Item* nowItem = (Item*)chan->items.LastItem(); - - if ( predicate == "title" ) - nowItem->title = BString(object.c_str()); - if ( predicate == "encoded" || 
predicate == "Atomcontent" ) - nowItem->content = object; - if ( predicate == "description" ) - nowItem->description = BString(object.c_str()); - if ( predicate == "link" || predicate == "Atomlink" ) - nowItem->postUrl = BString(object.c_str()); - if ( predicate == "Atomhref" ) - nowItem->postUrl = BString(object.c_str()); - if ( predicate == "date" || predicate == "Atompublished" ) // 2019-02-18T01:43:43Z - nowItem->pubDate = BString(object.c_str()); - if ( predicate == "pubDate" ) // Sun, 17 Feb 2019 19:43:43 -0600 - nowItem->pubDate = BString(object.c_str()); -} - - -// ============================================================================ -// UTIL -BString -getPredicateTag ( BString spec ) -{ - int32 lastSlash = spec.FindLast( '/' ); - spec.RemoveChars( 0, lastSlash + 1 ); - int32 lastHash = spec.FindLast( '#' ); - spec.RemoveChars( 0, lastHash + 1 ); - spec.RemoveLast( ">" ); - - return spec; -} -BString -getPredicateTag ( const char* spec ) -{ - return getPredicateTag( BString(spec) ); -} -BString -getPredicateTag ( std::string spec ) -{ - return getPredicateTag( spec.c_str() ); -} - -// ---------------------------------------------------------------------------- - -/* What ensues is a terrifying violation of the human form. - * Just atrotious. I deserve to be impaled by by an ice-pick. - * ... something (unfortunately), directly ripped from StackOverflow. - * So when getting a raptor_statement's object, it's a char array filled - * with escaped characters (\U2901, etc). - * I'm really not sure how to best manage this, so SO. - * Thanks remy-lebeau, I owe you. 
- * https://stackoverflow.com/questions/28534221 */ -std::string -toUtf8 ( uint32_t cp ) -{ - std::string result; - - int count; - if (cp <= 0x007F) - count = 1; - else if (cp <= 0x07FF) - count = 2; - else if (cp <= 0xFFFF) - count = 3; - else if (cp <= 0x10FFFF) - count = 4; - else - return result; // or throw an exception - - result.resize(count); - - if (count > 1) { - for (int i = count-1; i > 0; --i) { - result[i] = (char) (0x80 | (cp & 0x3F)); - cp >>= 6; - } - - for (int i = 0; i < count; ++i) - cp |= (1 << (7-i)); - } - - result[0] = (char) cp; - return result; -} - -std::string -unescape ( std::string str, std::string escape ) -{ - std::string::size_type startIdx = 0; - do - { - startIdx = str.find(escape, startIdx); - if (startIdx == std::string::npos) break; - - std::string::size_type endIdx = str.find_first_not_of("0123456789abcdefABCDEF", - startIdx+2); - if (endIdx == std::string::npos) break; - - std::string tmpStr = str.substr(startIdx+2, endIdx-(startIdx+2)); - std::istringstream iss(tmpStr); - - uint32_t cp; - if (iss >> std::hex >> cp) - { - std::string utf8 = toUtf8(cp); - str.replace(startIdx, 2+tmpStr.length(), utf8); - startIdx += utf8.length(); - } - else - startIdx += 2; - } - while (true); - - return str; -} - -std::string -unescape (const char* str ) -{ - return unescape(std::string( unescape(std::string(str), "\\u") ), "\\U"); + return count; } diff --git a/src/parsing.h b/src/parsing.h index 60081a4..7827b6d 100644 --- a/src/parsing.h +++ b/src/parsing.h @@ -1,28 +1,15 @@ #ifndef PARSE_H #define PARSE_H -#include -#include +#include +#include "Config.h" #include "Channel.h" - -void feedParser (Channel**); -void feedHandler ( void*, raptor_statement* ); -void handleFeedStatement ( Channel**, raptor_statement* ); -void handleChannelStatement ( Channel**, BString, BString ); -void handleItemStatement ( Channel**, BString, BString, std::string ); - -int countItemParser ( const char* ); -void countItemHandler ( void*, raptor_statement* ); - 
-void printStatementParser ( const char* ); -void printStatementHandler ( void*, raptor_statement* ); - -BString getPredicateTag ( const char* ); -BString getPredicateTag ( BString ); -BString getPredicateTag ( std::string ); -std::string to_utf ( uint32 ); -std::string unescape ( std::string, std::string ); -std::string unescape ( const char* ); +void feedParser ( Channel**, Config* ); +void rssParser ( Channel**, Config*, tinyxml2::XMLDocument* ); +void rssRootParse ( Channel**, Config*, tinyxml2::XMLElement* ); +void rssItemParse ( Channel**, Config*, tinyxml2::XMLElement* ); +void rssParseItems ( Channel**, Config*, tinyxml2::XMLElement* ); +int xmlCountSiblings ( tinyxml2::XMLElement*, const char* ); #endif