From 4d4e6bad789a375e750bb374999e16e429215111 Mon Sep 17 00:00:00 2001 From: Jaidyn Ann Date: Sat, 20 Jun 2020 21:15:34 -0500 Subject: [PATCH] Basic 'filetizing' of feed Channels and Items --- src/Channel.cpp | 3 +- src/Channel.h | 3 +- src/Item.cpp | 44 +++++++++++++- src/Item.h | 11 ++-- src/Rifen.cpp | 33 ++++------- src/parsing.cpp | 154 ++++++++++++++++++++++++++++++++++++++++-------- src/parsing.h | 9 ++- test/xwx.xml | 64 ++++++++++++++++++++ 8 files changed, 263 insertions(+), 58 deletions(-) create mode 100644 test/xwx.xml diff --git a/src/Channel.cpp b/src/Channel.cpp index 5bc88aa..b52d74d 100644 --- a/src/Channel.cpp +++ b/src/Channel.cpp @@ -4,7 +4,7 @@ #include "Item.h" #include "parsing.h" -Channel::Channel ( BString path ) +Channel::Channel ( BString path, BString outputPath ) { title = BString("Untitled Feed"); description = BString("Nondescript, N/A."); @@ -13,6 +13,7 @@ Channel::Channel ( BString path ) filePath = path; topLevelSubject = ""; lastSubject = ""; + outputDir = outputPath; } void diff --git a/src/Channel.h b/src/Channel.h index 1568aaa..aa2f302 100644 --- a/src/Channel.h +++ b/src/Channel.h @@ -19,9 +19,10 @@ public: BString topLevelSubject; BString lastSubject; BString filePath; + BString outputDir; - Channel ( BString ); + Channel ( BString, BString ); // Channel ( BEntry ); // Channel ( BUrl ); void Parse ( void ); diff --git a/src/Item.cpp b/src/Item.cpp index 8e38e66..a9ee188 100644 --- a/src/Item.cpp +++ b/src/Item.cpp @@ -1,13 +1,51 @@ -#include +#include +#include #include +#include #include "Item.h" -Item::Item ( BString localSubject ) +Item::Item ( BString localSubject, BString outputPath ) { subject = localSubject; title = BString(""); description = BString(""); homePage = BString(""); postUrl = BString(""); - content = BString(""); + content = ""; + pubDate = BString(""); + outputDir = outputPath; +} + +bool +Item::Filetize ( bool onlyIfNew = false ) +{ + BDirectory* dir = new BDirectory( outputDir ); + BFile* file = new 
BFile( title.String(), B_READ_WRITE ); + + dir->CreateFile( title.String(), file ); + + BString betype = "text/html"; + + file->WriteAttr( "META:title", B_STRING_TYPE, 0, + title.String(), title.CountChars() ); + file->WriteAttr( "description", B_STRING_TYPE, 0, + description.String(), description.CountChars() ); + file->WriteAttr( "pubDate", B_STRING_TYPE, 0, + pubDate.String(), pubDate.CountChars() ); + file->WriteAttr( "META:url", B_STRING_TYPE, 0, + postUrl.String(), postUrl.CountChars() ); + file->WriteAttr( "BEOS:TYPE", B_STRING_TYPE, 0, + betype.String(), betype.CountChars() ); + + // using file->Write with content converted to C string messes up length ofc + // this is required to preserve length (because of UTF char substitutions in parsing.cpp) + const char* strPath = outputDir.String(); + std::string path(strPath); + path += std::string(title.String()); + std::cout << path << std::endl; + + std::ofstream pFile(path); + pFile << content; + pFile.close(); + return false; } diff --git a/src/Item.h b/src/Item.h index 0e27f37..6f76a07 100644 --- a/src/Item.h +++ b/src/Item.h @@ -1,6 +1,7 @@ #ifndef ITEM_H #define ITEM_H +#include #include #include #include @@ -10,15 +11,17 @@ class Item { public: BString title; BString description; - BDate pubDate; + BString pubDate; BString homePage; BString postUrl; - BString content; + std::string content; + BString outputDir; BString subject; - void Print ( void ); - Item ( BString ); + Item ( BString, BString ); + + bool Filetize ( bool ); }; #endif diff --git a/src/Rifen.cpp b/src/Rifen.cpp index 762183c..570d6b6 100644 --- a/src/Rifen.cpp +++ b/src/Rifen.cpp @@ -2,40 +2,27 @@ #include #include "Channel.h" #include "Item.h" -#include "parsing.h" +#include "parsing.h" // + bool create_item ( void* item ) { - Item* itemPtr = (Item*)item; - - BDirectory* dir = new BDirectory("./test/test/"); - BFile* file = new BFile(itemPtr->title.String(), B_READ_WRITE); - - dir->CreateFile(itemPtr->title.String(), file); - - 
file->WriteAttr("title",B_STRING_TYPE,0, - itemPtr->title.String(),itemPtr->title.CountChars()); - file->WriteAttr("description",B_STRING_TYPE,0, - itemPtr->description.String(),itemPtr->description.CountChars()); - -// const char* buf; -// buf = itemPtr->title.String(); - file->Write(itemPtr->title.String(), itemPtr->title.CountChars()); - + Item* itemPtr = (Item*)item; + itemPtr->Filetize( false ); return false; } int main ( int argc, char** argv ) { + BString outputDir("/boot/home/feeds/"); Channel* chan = (Channel*)malloc( sizeof(Channel) ); - chan = new Channel(argv[1]); - chan->Parse(); - BList items = chan->items; - printf("%s\n", chan->title.String()); - items.DoForEach(&create_item); - + chan = new Channel(argv[1], outputDir); + chan->Parse(); + + BList items = chan->items; + items.DoForEach(&create_item); return 0; } diff --git a/src/parsing.cpp b/src/parsing.cpp index b6f5e84..2def08f 100644 --- a/src/parsing.cpp +++ b/src/parsing.cpp @@ -1,3 +1,5 @@ +#include +#include #include #include "Channel.h" #include "Item.h" @@ -20,7 +22,7 @@ feedParser ( Channel** chanPtr ) unsigned char *uri_string; raptor_uri *uri, *base_uri; - rss_parser = raptor_new_parser(world, "rss-tag-soup"); + rss_parser = raptor_new_parser( world, "rss-tag-soup" ); uri_string = raptor_uri_filename_to_uri_string( chan->filePath.String() ); uri = raptor_new_uri( world, uri_string ); base_uri = raptor_uri_copy( uri ); @@ -28,10 +30,10 @@ feedParser ( Channel** chanPtr ) raptor_parser_set_statement_handler( rss_parser, &chan, feedHandler ); raptor_parser_parse_file( rss_parser, uri, base_uri ); - raptor_free_parser(rss_parser); - raptor_free_uri(base_uri); - raptor_free_uri(uri); - raptor_free_memory(uri_string); + raptor_free_parser( rss_parser ); + raptor_free_uri( base_uri ); + raptor_free_uri( uri ); + raptor_free_memory( uri_string ); raptor_free_world( world ); } @@ -56,11 +58,11 @@ countItemParser ( const char* filePath ) raptor_parser_set_statement_handler( rss_parser, &itemCount, 
countItemHandler ); raptor_parser_parse_file( rss_parser, uri, base_uri ); - free(itemCount); - raptor_free_parser(rss_parser); - raptor_free_uri(base_uri); - raptor_free_uri(uri); - raptor_free_memory(uri_string); + free( itemCount ); + raptor_free_parser( rss_parser ); + raptor_free_uri( base_uri ); + raptor_free_uri( uri ); + raptor_free_memory( uri_string ); raptor_free_world( world ); return *(itemCount); @@ -84,10 +86,10 @@ printStatementParser ( const char* filePath ) raptor_parser_set_statement_handler( rss_parser, NULL, printStatementHandler ); raptor_parser_parse_file( rss_parser, uri, base_uri ); - raptor_free_parser(rss_parser); - raptor_free_uri(base_uri); - raptor_free_uri(uri); - raptor_free_memory(uri_string); + raptor_free_parser( rss_parser ); + raptor_free_uri( base_uri ); + raptor_free_uri( uri ); + raptor_free_memory( uri_string ); raptor_free_world( world ); } @@ -127,7 +129,7 @@ printStatementHandler ( void* user_data, raptor_statement* statement ) const char* predicate = ( const char* )raptor_term_to_string( statement->predicate ); const char* object = ( const char* )raptor_term_to_string( statement->object ); - printf("%s\t-%s\n%.5s\n", subject, predicate, object); + printf("%s\t-%s\n%.50s\n", subject, predicate, object); } // ---------------------------------------------------------------------------- @@ -136,9 +138,20 @@ void handleFeedStatement ( Channel** chanPtr, raptor_statement* statement ) { Channel* chan = *(chanPtr); - BString predicate = BString(( const char* )raptor_term_to_string( statement->predicate )); - BString subject = BString(( const char* )raptor_term_to_string( statement->subject )); - BString object = BString(( const char* )raptor_term_to_string( statement->object )); + const char* cpredicate = (const char*)raptor_term_to_string( statement->predicate ); + const char* csubject = (const char*)raptor_term_to_string( statement->subject ); + const char* cobject = (const char*)raptor_term_to_string( statement->object ); + + 
BString predicate = BString(cpredicate); + BString subject = BString(csubject); + BString bobject = BString(cobject); + + bobject.ReplaceAll("\\\"","\""); + bobject.ReplaceFirst("\"",""); + bobject.ReplaceLast("\"",""); + + std::string object = unescape(bobject.String()); + predicate = getPredicateTag( predicate ); if ( predicate == "type" && getPredicateTag( object ) == "channel" ) @@ -157,7 +170,7 @@ handleChannelStatement ( Channel** chanPtr, BString predicate, BString object ) } void -handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, BString object ) +handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, std::string object ) { Channel* chan = *(chanPtr); if ( subject.StartsWith("_:genid") ) @@ -169,7 +182,7 @@ handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, BSt chan->lastSubject = subject; Item* newItem = (Item*)malloc( sizeof(Item) ); - newItem = new Item( subject ); + newItem = new Item( subject, chan->outputDir ); chan->items.AddItem( newItem ); } @@ -177,10 +190,19 @@ handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, BSt Item* nowItem = (Item*)chan->items.LastItem(); if ( predicate == "title" ) - nowItem->title = object; - + nowItem->title = BString(object.c_str()); if ( predicate == "encoded" || predicate == "Atomcontent" ) nowItem->content = object; + if ( predicate == "description" ) + nowItem->description = BString(object.c_str()); + if ( predicate == "link" || predicate == "Atomlink" ) + nowItem->postUrl = BString(object.c_str()); + if ( predicate == "Atomhref" ) + nowItem->postUrl = BString(object.c_str()); + if ( predicate == "date" || predicate == "Atompublished" ) // 2019-02-18T01:43:43Z + nowItem->pubDate = BString(object.c_str()); + if ( predicate == "pubDate" ) // Sun, 17 Feb 2019 19:43:43 -0600 + nowItem->pubDate = BString(object.c_str()); } @@ -197,9 +219,93 @@ getPredicateTag ( BString spec ) return spec; } - BString -getPredicateTag ( 
char* spec ) +getPredicateTag ( const char* spec ) { return getPredicateTag( BString(spec) ); } +BString +getPredicateTag ( std::string spec ) +{ + return getPredicateTag( spec.c_str() ); +} + +// ---------------------------------------------------------------------------- + +/* What ensues is a terrifying violation of the human form. + * Just atrocious. I deserve to be impaled by an ice-pick. + * ... something (unfortunately), directly ripped from StackOverflow. + * So when getting a raptor_statement's object, it's a char array filled + * with escaped characters (\U2901, etc). + * I'm really not sure how to best manage this, so SO. + * Thanks remy-lebeau, I owe you. + * https://stackoverflow.com/questions/28534221 */ +std::string +toUtf8 ( uint32_t cp ) +{ + std::string result; + + int count; + if (cp <= 0x007F) + count = 1; + else if (cp <= 0x07FF) + count = 2; + else if (cp <= 0xFFFF) + count = 3; + else if (cp <= 0x10FFFF) + count = 4; + else + return result; // or throw an exception + + result.resize(count); + + if (count > 1) { + for (int i = count-1; i > 0; --i) { + result[i] = (char) (0x80 | (cp & 0x3F)); + cp >>= 6; + } + + for (int i = 0; i < count; ++i) + cp |= (1 << (7-i)); + } + + result[0] = (char) cp; + return result; +} + +std::string +unescape ( std::string str, std::string escape ) +{ + std::string::size_type startIdx = 0; + do + { + startIdx = str.find(escape, startIdx); + if (startIdx == std::string::npos) break; + + std::string::size_type endIdx = str.find_first_not_of("0123456789abcdefABCDEF", + startIdx+2); + if (endIdx == std::string::npos) break; + + std::string tmpStr = str.substr(startIdx+2, endIdx-(startIdx+2)); + std::istringstream iss(tmpStr); + + uint32_t cp; + if (iss >> std::hex >> cp) + { + std::string utf8 = toUtf8(cp); + str.replace(startIdx, 2+tmpStr.length(), utf8); + startIdx += utf8.length(); + } + else + startIdx += 2; + } + while (true); + + return str; +} + +std::string +unescape (const char* str ) +{ + return 
unescape(std::string( unescape(std::string(str), "\\u") ), "\\U"); +} diff --git a/src/parsing.h b/src/parsing.h index e299c7a..60081a4 100644 --- a/src/parsing.h +++ b/src/parsing.h @@ -1,6 +1,7 @@ #ifndef PARSE_H #define PARSE_H +#include #include #include "Channel.h" @@ -9,7 +10,7 @@ void feedParser (Channel**); void feedHandler ( void*, raptor_statement* ); void handleFeedStatement ( Channel**, raptor_statement* ); void handleChannelStatement ( Channel**, BString, BString ); -void handleItemStatement ( Channel**, BString, BString, BString ); +void handleItemStatement ( Channel**, BString, BString, std::string ); int countItemParser ( const char* ); void countItemHandler ( void*, raptor_statement* ); @@ -17,7 +18,11 @@ void countItemHandler ( void*, raptor_statement* ); void printStatementParser ( const char* ); void printStatementHandler ( void*, raptor_statement* ); -BString getPredicateTag ( char* ); +BString getPredicateTag ( const char* ); BString getPredicateTag ( BString ); +BString getPredicateTag ( std::string ); +std::string to_utf ( uint32 ); +std::string unescape ( std::string, std::string ); +std::string unescape ( const char* ); #endif diff --git a/test/xwx.xml b/test/xwx.xml new file mode 100644 index 0000000..af122f0 --- /dev/null +++ b/test/xwx.xml @@ -0,0 +1,64 @@ + + + + galactic station xwx + http://localhost:8000 + + + + La Haiku Funkcisistemo k Esperanto + http://localhost:8000../lib/haiku-k-esperanto.html + + Sat, 9 May 2020 01:27:32 -0600 + + + Preter Vim: hoj Kakoune! 
+ http://localhost:8000../lib/preter-vim-al-kak.html + + Thu, 02 Jan 2019 00:05:20 -0600 + + + New domain - Novjaro k novnom + http://localhost:8000../lib/nova-retejnomo.html + + Web, 1 Jan 2020 19:43:43 -0600 + + + Trans la Rivera Lavejo + http://localhost:8000../lib/trans-la-rivero.html + + Wed, 13 Nov 2019 13:55:23 -0600 + + + Arteco kaj Derivfikcio + http://localhost:8000../lib/artec-kaj-fanatikfikci.html + + Sat, 2 Nov 2019 00:52:44 -0600 + + + Universalismo kaj fikciaj bestaĉoj + http://localhost:8000../lib/universalismo-kaj-ficiaj-bestaĉoj.html + + Mon, 15 Jul 2019 22:05:32 -0600 + + + Project Diva f with English subs + http://localhost:8000../lib/project-diva-f-better-english.html + + Wed, 26 Jun 2019 22:37:56 -0600 + + + SBCL k plibonigita terminalo + http://localhost:8000../lib/sbcl-k-plibonigita-terminalo.html + + Wed, 19 Jun 2019 20:21:01 -0600 + + + Cowsay and Rewarding HTML + http://localhost:8000../lib/cowsay-and-html.html + + Sun, 17 Feb 2019 19:43:43 -0600 + + + +