Start replacing raptor with tinyxml (finally!)

This commit is contained in:
Jaidyn Ann 2020-07-04 04:44:54 -05:00
parent 52908aef68
commit 39d5842e7c
10 changed files with 213 additions and 340 deletions

View File

@ -68,7 +68,7 @@ RSRCS = \
# - if your library does not follow the standard library naming scheme,
# you need to specify the path to the library and its name.
# (e.g. for mylib.a, specify "mylib.a" or "path/mylib.a")
LIBS = be tracker shared raptor2 bnetapi network $(STDCPPLIBS)
LIBS = be tracker shared tinyxml2 bnetapi network $(STDCPPLIBS)
# Specify additional paths to directories following the standard libXXX.so
# or libXXX.a naming scheme. You can specify full paths or paths relative

View File

@ -1,5 +1,5 @@
#include <cstdio>
#include <raptor2/raptor2.h>
#include <tinyxml2.h>
#include "Channel.h"
#include "Item.h"
#include "Config.h"
@ -12,6 +12,7 @@ Channel::Channel ( BString path, BString outputPath )
homePage = BString("");
xmlUrl = BString("");
filePath = path;
lastDate = BString("");
topLevelSubject = "";
lastSubject = "";
outputDir = outputPath;
@ -20,8 +21,36 @@ Channel::Channel ( BString path, BString outputPath )
void
Channel::Parse ( Config* cfg )
{
int itemCount = countItemParser( filePath.String() );
items = BList(itemCount);
items = BList();
Channel* chan = this;
feedParser(&chan);
feedParser(&chan, cfg);
}
// Stores a copy of titleStr as the channel title; a NULL argument is ignored.
void Channel::SetTitle ( const char* titleStr ) {
	if ( titleStr == NULL )
		return;
	title = BString( titleStr );
}
// Takes the title from the element's text; a NULL element is ignored.
void Channel::SetTitle ( tinyxml2::XMLElement* elem ) {
	if ( elem == NULL )
		return;
	const char* text = elem->GetText();
	SetTitle( text );
}
// Stores a copy of descStr as the channel description; NULL is ignored.
void Channel::SetDesc ( const char* descStr ) {
	if ( descStr == NULL )
		return;
	description = BString( descStr );
}
// Takes the description from the element's text; a NULL element is ignored.
void Channel::SetDesc ( tinyxml2::XMLElement* elem ) {
	if ( elem == NULL )
		return;
	const char* text = elem->GetText();
	SetDesc( text );
}
// Stores a copy of homepageStr as the channel home page; NULL is ignored.
void Channel::SetHomePage ( const char* homepageStr ) {
	if ( homepageStr == NULL )
		return;
	homePage = BString( homepageStr );
}
// Takes the home page from the element's text; a NULL element is ignored.
void Channel::SetHomePage ( tinyxml2::XMLElement* elem ) {
	if ( elem == NULL )
		return;
	const char* text = elem->GetText();
	SetHomePage( text );
}
// Stores a copy of dateStr (kept as raw text) in lastDate; NULL is ignored.
void Channel::SetLastDate ( const char* dateStr ) {
	if ( dateStr == NULL )
		return;
	lastDate = BString( dateStr );
}
// Takes lastDate from the element's text; a NULL element is ignored.
void Channel::SetLastDate ( tinyxml2::XMLElement* elem ) {
	if ( elem == NULL )
		return;
	const char* text = elem->GetText();
	SetLastDate( text );
}

View File

@ -1,6 +1,7 @@
#ifndef CHANNEL_H
#define CHANNEL_H
#include <tinyxml2.h>
#include <DateTime.h>
#include <String.h>
#include <List.h>
@ -12,7 +13,7 @@ public:
char lang[3];
BString title;
BString description;
BDate lastBuildDate;
BString lastDate;
BString homePage;
BString xmlUrl;
BList items;
@ -27,6 +28,15 @@ public:
// Channel ( BEntry );
// Channel ( BUrl );
void Parse ( Config* );
void SetTitle ( const char* );
void SetTitle ( tinyxml2::XMLElement* );
void SetDesc ( const char* );
void SetDesc ( tinyxml2::XMLElement* );
void SetLastDate ( const char* );
void SetLastDate ( tinyxml2::XMLElement* );
void SetHomePage ( const char* );
void SetHomePage ( tinyxml2::XMLElement* );
};
#endif

View File

@ -1,6 +1,7 @@
#ifndef CONFIG_H
#define CONFIG_H
#include <String.h>
#include <StorageKit.h>
class Config {

View File

@ -1,30 +1,30 @@
#include <iostream>
#include <fstream>
#include <raptor2/raptor2.h>
#include <tinyxml2.h>
#include <StorageKit.h>
#include "Config.h"
#include "Item.h"
Item::Item ( BString localSubject, BString outputPath )
Item::Item ( BString outputPath )
{
subject = localSubject;
title = BString("");
description = BString("");
homePage = BString("");
postUrl = BString("");
content = "";
content = BString("");
pubDate = BString("");
outputDir = outputPath;
}
bool
Item::Filetize ( bool onlyIfNew = false )
Item::Filetize ( Config* cfg, bool onlyIfNew = false )
{
BDirectory* dir = new BDirectory( outputDir );
BFile* file = new BFile( title.String(), B_READ_WRITE );
dir->CreateFile( title.String(), file );
BString betype = "text/html";
BString betype = cfg->mimetype;
file->WriteAttr( "META:title", B_STRING_TYPE, 0,
title.String(), title.CountChars() );
@ -37,15 +37,53 @@ Item::Filetize ( bool onlyIfNew = false )
file->WriteAttr( "BEOS:TYPE", B_STRING_TYPE, 0,
betype.String(), betype.CountChars() );
file->Write(content.String(), content.Length());
// using file->Write with content converted to C string messes up length ofc
// this is required to preserve length (because of UTF char substitutions in parsing.cpp)
const char* strPath = outputDir.String();
std::string path(strPath);
path += std::string(title.String());
std::cout << path << std::endl;
std::ofstream pFile(path);
pFile << content;
pFile.close();
// const char* strPath = outputDir.String();
// std::string path(strPath);
// path += std::string(title.String());
// std::cout << path << std::endl;
//
// std::ofstream pFile(path);
// pFile << content;
// pFile.close();
return false;
}
// Stores a copy of titleStr as the item title; a NULL argument is ignored.
void Item::SetTitle ( const char* titleStr ) {
	if ( titleStr == NULL )
		return;
	title = BString( titleStr );
}
// Takes the title from the element's text; a NULL element is ignored.
void Item::SetTitle ( tinyxml2::XMLElement* elem ) {
	if ( elem == NULL )
		return;
	const char* text = elem->GetText();
	SetTitle( text );
}
// Stores a copy of descStr as the item description; NULL is ignored.
void Item::SetDesc ( const char* descStr ) {
	if ( descStr == NULL )
		return;
	description = BString( descStr );
}
// Takes the description from the element's text; a NULL element is ignored.
void Item::SetDesc ( tinyxml2::XMLElement* elem ) {
	if ( elem == NULL )
		return;
	const char* text = elem->GetText();
	SetDesc( text );
}
// Stores a copy of contentStr as the item body; a NULL argument is ignored.
void Item::SetContent ( const char* contentStr ) {
	if ( contentStr == NULL )
		return;
	content = BString( contentStr );
}
// Takes the item body from the element's text; a NULL element is ignored.
void Item::SetContent ( tinyxml2::XMLElement* elem ) {
	if ( elem == NULL )
		return;
	const char* text = elem->GetText();
	SetContent( text );
}
// Stores a copy of urlStr as the item's post URL; a NULL argument is ignored.
void Item::SetPostUrl ( const char* urlStr ) {
	if ( urlStr == NULL )
		return;
	postUrl = BString( urlStr );
}
// Takes the post URL from the element's text; a NULL element is ignored.
void Item::SetPostUrl ( tinyxml2::XMLElement* elem ) {
	if ( elem == NULL )
		return;
	const char* text = elem->GetText();
	SetPostUrl( text );
}
// Stores a copy of dateStr (kept as raw text) in pubDate; NULL is ignored.
void Item::SetPubDate ( const char* dateStr ) {
	if ( dateStr == NULL )
		return;
	pubDate = BString( dateStr );
}
// Takes pubDate from the element's text; a NULL element is ignored.
void Item::SetPubDate ( tinyxml2::XMLElement* elem ) {
	if ( elem == NULL )
		return;
	const char* text = elem->GetText();
	SetPubDate( text );
}

View File

@ -14,14 +14,25 @@ public:
BString pubDate;
BString homePage;
BString postUrl;
std::string content;
BString content;
BString outputDir;
BString subject;
Item ( BString );
Item ( BString, BString );
bool Filetize ( Config*, bool );
bool Filetize ( bool );
void SetTitle ( const char* );
void SetTitle ( tinyxml2::XMLElement* );
void SetDesc ( const char* );
void SetDesc ( tinyxml2::XMLElement* );
void SetContent ( const char* );
void SetContent ( tinyxml2::XMLElement* );
void SetPostUrl ( const char* );
void SetPostUrl ( tinyxml2::XMLElement* );
void SetPubDate ( const char* );
void SetPubDate ( tinyxml2::XMLElement* );
};
#endif

View File

@ -1,4 +1,3 @@
#include <raptor2/raptor2.h>
#include <StorageKit.h>
#include <String.h>
#include <getopt.h>
@ -8,6 +7,8 @@
#include "Config.h"
#include "Rifen.h"
Config* main_cfg;
int
usage ()
{
@ -18,9 +19,8 @@ usage ()
bool
create_item ( void* item )
{
printf("hi");
Item* itemPtr = (Item*)item;
itemPtr->Filetize( false );
itemPtr->Filetize( main_cfg, false );
return false;
}
@ -78,16 +78,14 @@ invocation ( int argc, char** argv, Config** cfgPtr )
int
main ( int argc, char** argv )
{
//
//
Config* cfg = new Config;
main_cfg = new Config;
usageMsg.ReplaceAll("%app%", "Rifen");
invocation( argc, argv, &cfg );
invocation( argc, argv, &main_cfg );
Channel* chan = (Channel*)malloc( sizeof(Channel) );
chan = new Channel(cfg->targetFeed, cfg->outDir);
chan->Parse(cfg);
chan = new Channel(main_cfg->targetFeed, main_cfg->outDir);
chan->Parse(main_cfg);
BList items = chan->items;
items.DoForEach(&create_item);

View File

@ -37,5 +37,12 @@ BString usageMsg =
"Both -t and -T use the ISO 8601 format for specifying datetimes:\n"
" YYYY-MM-DDTHH:MM:SS - 2020-01-01T07:07:07\n"
"You can leave out seconds, minutes, or hours, but YMD are required.\n"
"\n"
"NOTE: This message doesn't reflect reality. This is more of a spec of\n"
" what I hope this program will be. As of now -t and -T aren't\n"
" implemented, and running %app% without a file/url free-argument\n"
" is invalid, as the daemon isn't implemented at all. As such,\n"
" -D is also non-functional.\n"
" But it sure can turn an XML feed into files! Lol.\n"
;

View File

@ -1,6 +1,6 @@
#include <iostream>
#include <sstream>
#include <raptor2/raptor2.h>
#include <tinyxml2.h>
#include "Channel.h"
#include "Item.h"
#include "parsing.h"
@ -9,300 +9,92 @@
// ============================================================================
// PARSERS
void
feedParser ( Channel** chanPtr )
feedParser ( Channel** chanPtr, Config* cfg )
{
Channel* chan = *(chanPtr);
raptor_parser* rss_parser = NULL;
raptor_world* world;
world = raptor_new_world();
unsigned char *uri_string;
raptor_uri *uri, *base_uri;
tinyxml2::XMLDocument xml;
xml.LoadFile( chan->filePath.String() );
rss_parser = raptor_new_parser( world, "rss-tag-soup" );
uri_string = raptor_uri_filename_to_uri_string( chan->filePath.String() );
uri = raptor_new_uri( world, uri_string );
base_uri = raptor_uri_copy( uri );
raptor_parser_set_statement_handler( rss_parser, &chan, feedHandler );
raptor_parser_parse_file( rss_parser, uri, base_uri );
raptor_free_parser( rss_parser );
raptor_free_uri( base_uri );
raptor_free_uri( uri );
raptor_free_memory( uri_string );
raptor_free_world( world );
}
// -------------------------------------
int
countItemParser ( const char* filePath )
{
raptor_parser* rss_parser = NULL;
raptor_world* world;
world = raptor_new_world();
unsigned char *uri_string;
raptor_uri *uri, *base_uri;
rss_parser = raptor_new_parser(world, "rss-tag-soup");
uri_string = raptor_uri_filename_to_uri_string( filePath );
uri = raptor_new_uri( world, uri_string );
base_uri = raptor_uri_copy( uri );
int* itemCount = (int*)malloc( sizeof(int) );
*itemCount = 0;
raptor_parser_set_statement_handler( rss_parser, &itemCount, countItemHandler );
raptor_parser_parse_file( rss_parser, uri, base_uri );
free( itemCount );
raptor_free_parser( rss_parser );
raptor_free_uri( base_uri );
raptor_free_uri( uri );
raptor_free_memory( uri_string );
raptor_free_world( world );
return *(itemCount);
if ( xml.FirstChildElement("rss") )
rssParser( chanPtr, cfg, &xml );
else if ( xml.FirstChildElement("feed") )
printf("has atom\n");
}
void
printStatementParser ( const char* filePath )
{
raptor_parser* rss_parser = NULL;
raptor_world* world;
world = raptor_new_world();
unsigned char *uri_string;
raptor_uri *uri, *base_uri;
rss_parser = raptor_new_parser(world, "rss-tag-soup");
uri_string = raptor_uri_filename_to_uri_string( filePath );
uri = raptor_new_uri( world, uri_string );
base_uri = raptor_uri_copy( uri );
raptor_parser_set_statement_handler( rss_parser, NULL, printStatementHandler );
raptor_parser_parse_file( rss_parser, uri, base_uri );
raptor_free_parser( rss_parser );
raptor_free_uri( base_uri );
raptor_free_uri( uri );
raptor_free_memory( uri_string );
raptor_free_world( world );
}
// ============================================================================
// HANDLERS
// Raptor statement callback: treats user_data as a Channel** and forwards
// the statement to handleFeedStatement. Does nothing when user_data is NULL.
void
feedHandler ( void* user_data, raptor_statement* statement )
{
	if ( user_data == NULL )
		return;

	handleFeedStatement( (Channel**)user_data, statement );
}
void
countItemHandler ( void* user_data, raptor_statement* statement )
{
int** countPtr = ( int** )user_data;
int* count = *(countPtr);
const char* object = ( const char* )raptor_term_to_string( statement->object );
const char* predicate = ( const char* )raptor_term_to_string( statement->predicate );
if (getPredicateTag(predicate) == "type"
&& getPredicateTag(object) == "item")
*count += 1;
}
void
printStatementHandler ( void* user_data, raptor_statement* statement )
{
int** countPtr = (int**)user_data;
int* count = *(countPtr);
const char* subject = ( const char* )raptor_term_to_string( statement->subject );
const char* predicate = ( const char* )raptor_term_to_string( statement->predicate );
const char* object = ( const char* )raptor_term_to_string( statement->object );
printf("%s\t-%s\n%.50s\n", subject, predicate, object);
}
// ----------------------------------------------------------------------------
// FEEDHANDLER HELPERS
void
handleFeedStatement ( Channel** chanPtr, raptor_statement* statement )
rssParser ( Channel** chanPtr, Config* cfg, tinyxml2::XMLDocument* xml )
{
Channel* chan = *(chanPtr);
const char* cpredicate = (const char*)raptor_term_to_string( statement->predicate );
const char* csubject = (const char*)raptor_term_to_string( statement->subject );
const char* cobject = (const char*)raptor_term_to_string( statement->object );
BString predicate = BString(cpredicate);
BString subject = BString(csubject);
BString bobject = BString(cobject);
tinyxml2::XMLElement* xchan = xml->FirstChildElement("rss")->FirstChildElement("channel");
bobject.ReplaceAll("\\\"","\"");
bobject.ReplaceFirst("\"","");
bobject.ReplaceLast("\"","");
std::string object = unescape(bobject.String());
predicate = getPredicateTag( predicate );
if ( predicate == "type" && getPredicateTag( object ) == "channel" )
chan->topLevelSubject = subject;
if ( subject != chan->topLevelSubject )
// handleChannelStatement( chanPtr, predicate, object );
// else
handleItemStatement( chanPtr, subject, predicate, object );
rssRootParse( chanPtr, cfg, xchan );
rssParseItems( chanPtr, cfg, xchan );
}
void
handleChannelStatement ( Channel** chanPtr, BString predicate, BString object )
rssRootParse( Channel** chanPtr, Config* cfg, tinyxml2::XMLElement* xchan )
{
Channel* chan = *(chanPtr);
chan->SetTitle( xchan->FirstChildElement("title") );
chan->SetDesc( xchan->FirstChildElement("description") );
chan->SetHomePage( xchan->FirstChildElement("link") );
chan->SetLastDate( xchan->FirstChildElement("lastBuildDate") );
if ( cfg->verbose )
printf("Channel '%s' at '%s':\n", chan->title.String(), chan->homePage.String());
}
void
handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, std::string object )
rssItemParse ( Channel** chanPtr, Config* cfg, tinyxml2::XMLElement* xitem )
{
Channel* chan = *(chanPtr);
if ( subject.StartsWith("_:genid") )
return;
chan->title = BString("dad");
if ( subject != chan->lastSubject ) {
chan->lastSubject = subject;
Item* newItem = (Item*)malloc( sizeof(Item) );
newItem = new Item( subject, chan->outputDir );
newItem = new Item( chan->outputDir );
newItem->SetTitle( xitem->FirstChildElement("title") );
newItem->SetDesc( xitem->FirstChildElement("description") );
newItem->SetPubDate( xitem->FirstChildElement("pubDate") );
newItem->SetContent( xitem->FirstChildElement("content:encoded") );
if (cfg->verbose )
printf("\t%s\n", newItem->title.String());
chan->items.AddItem( newItem );
}
// Walks every <item> child of the given RSS <channel> element and parses
// each one into the channel's item list via rssItemParse.
//   chanPtr - pointer to the Channel* being filled in
//   cfg     - run configuration; its verbose flag enables the count printout
//   xchan   - the <channel> element; may be NULL when the feed has none
// The channel's item list is replaced before parsing starts. Nothing happens
// (and the list is left alone) when there is no channel or no <channel>
// element to read from.
void
rssParseItems ( Channel** chanPtr, Config* cfg, tinyxml2::XMLElement* xchan )
{
	if ( chanPtr == NULL || *chanPtr == NULL || xchan == NULL )
		return;

	Channel* chan = *(chanPtr);
	tinyxml2::XMLElement* xitem = xchan->FirstChildElement("item");

	int itemCount = xmlCountSiblings( xitem, "item" );
	chan->items = BList(itemCount);

	if ( cfg->verbose )
		printf("\t-%i items-\n", itemCount);

	for ( ; xitem != NULL; xitem = xitem->NextSiblingElement("item") )
		rssItemParse( chanPtr, cfg, xitem );
}
int
xmlCountSiblings ( tinyxml2::XMLElement* xsibling, const char* sibling_name )
{
int count = 0;
while ( xsibling ) {
count++;
xsibling = xsibling->NextSiblingElement(sibling_name);
}
Item* nowItem = (Item*)chan->items.LastItem();
if ( predicate == "title" )
nowItem->title = BString(object.c_str());
if ( predicate == "encoded" || predicate == "Atomcontent" )
nowItem->content = object;
if ( predicate == "description" )
nowItem->description = BString(object.c_str());
if ( predicate == "link" || predicate == "Atomlink" )
nowItem->postUrl = BString(object.c_str());
if ( predicate == "Atomhref" )
nowItem->postUrl = BString(object.c_str());
if ( predicate == "date" || predicate == "Atompublished" ) // 2019-02-18T01:43:43Z
nowItem->pubDate = BString(object.c_str());
if ( predicate == "pubDate" ) // Sun, 17 Feb 2019 19:43:43 -0600
nowItem->pubDate = BString(object.c_str());
}
// ============================================================================
// UTIL
// Reduces a term string to its bare tag: everything up to and including the
// last '/' is dropped, then everything up to and including the last '#',
// and finally the last ">" is removed.
BString
getPredicateTag ( BString spec )
{
	// FindLast() reports a byte offset, so the prefix is trimmed with the
	// byte-based Remove() rather than the character-counting RemoveChars();
	// the two disagree as soon as the prefix holds multi-byte UTF-8.
	int32 lastSlash = spec.FindLast( '/' );
	if ( lastSlash >= 0 )
		spec.Remove( 0, lastSlash + 1 );

	int32 lastHash = spec.FindLast( '#' );
	if ( lastHash >= 0 )
		spec.Remove( 0, lastHash + 1 );

	spec.RemoveLast( ">" );
	return spec;
}
// C-string convenience overload: wraps spec in a BString and defers to the
// BString overload.
BString
getPredicateTag ( const char* spec )
{
	BString wrapped( spec );
	return getPredicateTag( wrapped );
}
// std::string convenience overload: defers to the C-string overload.
BString
getPredicateTag ( std::string spec )
{
	const char* raw = spec.c_str();
	return getPredicateTag( raw );
}
// ----------------------------------------------------------------------------
/* What ensues is a terrifying violation of the human form.
* Just atrocious. I deserve to be impaled by an ice-pick.
* ... something (unfortunately), directly ripped from StackOverflow.
* So when getting a raptor_statement's object, it's a char array filled
* with escaped characters (\U2901, etc).
* I'm really not sure how to best manage this, so SO.
* Thanks remy-lebeau, I owe you.
* https://stackoverflow.com/questions/28534221 */
// Encodes a single Unicode code point as UTF-8.
// Returns an empty string for values above 0x10FFFF.
std::string
toUtf8 ( uint32_t cp )
{
	std::string result;

	int count;
	if (cp <= 0x007F)
		count = 1;
	else if (cp <= 0x07FF)
		count = 2;
	else if (cp <= 0xFFFF)
		count = 3;
	else if (cp <= 0x10FFFF)
		count = 4;
	else
		return result; // not encodable

	result.resize(count);

	if (count > 1) {
		// Continuation bytes, filled from the last one backwards;
		// each carries six payload bits.
		for (int i = count-1; i > 0; --i) {
			result[i] = static_cast<char>(0x80 | (cp & 0x3F));
			cp >>= 6;
		}
		// Lead byte: as many high bits set as the sequence has bytes.
		for (int i = 0; i < count; ++i)
			cp |= (1u << (7-i));
	}

	result[0] = static_cast<char>(cp);
	return result;
}


// Replaces every occurrence of `escape` followed by hexadecimal digits
// (e.g. "\u00e9" for escape "\u") with the UTF-8 encoding of that code point.
//   str    - text to decode (taken by value; the decoded copy is returned)
//   escape - the escape prefix to look for; an empty prefix decodes nothing
// An escape whose digits cannot be read as a 32-bit value is left as-is.
// NOTE: digits are consumed greedily, so hex-looking text that directly
// follows an escape is treated as part of it.
std::string
unescape ( std::string str, std::string escape )
{
	static const char* const kHexDigits = "0123456789abcdefABCDEF";

	if ( escape.empty() )
		return str;

	const std::string::size_type escLen = escape.length();
	std::string::size_type startIdx = 0;

	while ( (startIdx = str.find(escape, startIdx)) != std::string::npos ) {
		const std::string::size_type digitsIdx = startIdx + escLen;

		// When the digits run up to the end of the string there is no
		// terminating character to find; the escape still has to be decoded.
		std::string::size_type endIdx = str.find_first_not_of(kHexDigits, digitsIdx);
		if ( endIdx == std::string::npos )
			endIdx = str.length();

		const std::string digits = str.substr(digitsIdx, endIdx - digitsIdx);
		std::istringstream iss(digits);
		uint32_t cp;

		if ( iss >> std::hex >> cp ) {
			const std::string utf8 = toUtf8(cp);
			str.replace(startIdx, escLen + digits.length(), utf8);
			startIdx += utf8.length();
		}
		else
			startIdx += escLen; // not a usable escape; step over the prefix
	}

	return str;
}
std::string
unescape (const char* str )
{
return unescape(std::string( unescape(std::string(str), "\\u") ), "\\U");
return count;
}

View File

@ -1,28 +1,15 @@
#ifndef PARSE_H
#define PARSE_H
#include <iostream>
#include <raptor2/raptor2.h>
#include <tinyxml2.h>
#include "Config.h"
#include "Channel.h"
void feedParser (Channel**);
void feedHandler ( void*, raptor_statement* );
void handleFeedStatement ( Channel**, raptor_statement* );
void handleChannelStatement ( Channel**, BString, BString );
void handleItemStatement ( Channel**, BString, BString, std::string );
int countItemParser ( const char* );
void countItemHandler ( void*, raptor_statement* );
void printStatementParser ( const char* );
void printStatementHandler ( void*, raptor_statement* );
BString getPredicateTag ( const char* );
BString getPredicateTag ( BString );
BString getPredicateTag ( std::string );
std::string to_utf ( uint32 );
std::string unescape ( std::string, std::string );
std::string unescape ( const char* );
void feedParser ( Channel**, Config* );
void rssParser ( Channel**, Config*, tinyxml2::XMLDocument* );
void rssRootParse ( Channel**, Config*, tinyxml2::XMLElement* );
void rssItemParse ( Channel**, Config*, tinyxml2::XMLElement* );
void rssParseItems ( Channel**, Config*, tinyxml2::XMLElement* );
int xmlCountSiblings ( tinyxml2::XMLElement*, const char* );
#endif