Basic 'filetizing' of feed Channels and Items

This commit is contained in:
Jaidyn Ann 2020-06-20 21:15:34 -05:00
parent 00c42a860c
commit 4d4e6bad78
8 changed files with 263 additions and 58 deletions

View File

@ -4,7 +4,7 @@
#include "Item.h"
#include "parsing.h"
Channel::Channel ( BString path )
Channel::Channel ( BString path, BString outputPath )
{
title = BString("Untitled Feed");
description = BString("Nondescript, N/A.");
@ -13,6 +13,7 @@ Channel::Channel ( BString path )
filePath = path;
topLevelSubject = "";
lastSubject = "";
outputDir = outputPath;
}
void

View File

@ -19,9 +19,10 @@ public:
BString topLevelSubject;
BString lastSubject;
BString filePath;
BString outputDir;
Channel ( BString );
Channel ( BString, BString );
// Channel ( BEntry );
// Channel ( BUrl );
void Parse ( void );

View File

@ -1,13 +1,51 @@
#include <cstdio>
#include <iostream>
#include <fstream>
#include <raptor2/raptor2.h>
#include <StorageKit.h>
#include "Item.h"
// Item constructor: stores the given subject and output directory on the new
// Item and starts every other text field out as an empty string.
// NOTE(review): two signature lines and two `content` assignments appear back
// to back here — presumably the previous and the current version of the same
// line; confirm which of each pair belongs in the file.
Item::Item ( BString localSubject )
Item::Item ( BString localSubject, BString outputPath )
{
subject = localSubject;
title = BString("");
description = BString("");
homePage = BString("");
postUrl = BString("");
content = BString("");
content = "";
pubDate = BString("");
// Directory handed in by the caller; Filetize() reads it when writing the item.
outputDir = outputPath;
}
bool
Item::Filetize ( bool onlyIfNew = false )
{
BDirectory* dir = new BDirectory( outputDir );
BFile* file = new BFile( title.String(), B_READ_WRITE );
dir->CreateFile( title.String(), file );
BString betype = "text/html";
file->WriteAttr( "META:title", B_STRING_TYPE, 0,
title.String(), title.CountChars() );
file->WriteAttr( "description", B_STRING_TYPE, 0,
description.String(), description.CountChars() );
file->WriteAttr( "pubDate", B_STRING_TYPE, 0,
pubDate.String(), pubDate.CountChars() );
file->WriteAttr( "META:url", B_STRING_TYPE, 0,
postUrl.String(), postUrl.CountChars() );
file->WriteAttr( "BEOS:TYPE", B_STRING_TYPE, 0,
betype.String(), betype.CountChars() );
// using file->Write with content converted to C string messes up length ofc
// this is required to preserve length (because of UTF char substitutions in parsing.cpp)
const char* strPath = outputDir.String();
std::string path(strPath);
path += std::string(title.String());
std::cout << path << std::endl;
std::ofstream pFile(path);
pFile << content;
pFile.close();
return false;
}

View File

@ -1,6 +1,7 @@
#ifndef ITEM_H
#define ITEM_H
#include <iostream>
#include <DateTime.h>
#include <String.h>
#include <List.h>
@ -10,15 +11,17 @@ class Item {
public:
BString title;
BString description;
BDate pubDate;
BString pubDate;
BString homePage;
BString postUrl;
BString content;
std::string content;
BString outputDir;
BString subject;
void Print ( void );
Item ( BString );
Item ( BString, BString );
bool Filetize ( bool );
};
#endif

View File

@ -2,40 +2,27 @@
#include <StorageKit.h>
#include "Channel.h"
#include "Item.h"
#include "parsing.h"
#include "parsing.h" //
// Per-item callback that main() hands to BList::DoForEach.
// `item` is treated as an Item*; the callback always returns false.
// NOTE(review): this view shows a hand-written file/attribute routine followed
// by a second `itemPtr` declaration and a Filetize() call — presumably the
// previous body and its replacement interleaved; confirm which lines belong in
// the file.
bool
create_item ( void* item )
{
Item* itemPtr = (Item*)item;
BDirectory* dir = new BDirectory("./test/test/");
BFile* file = new BFile(itemPtr->title.String(), B_READ_WRITE);
dir->CreateFile(itemPtr->title.String(), file);
file->WriteAttr("title",B_STRING_TYPE,0,
itemPtr->title.String(),itemPtr->title.CountChars());
file->WriteAttr("description",B_STRING_TYPE,0,
itemPtr->description.String(),itemPtr->description.CountChars());
// const char* buf;
// buf = itemPtr->title.String();
file->Write(itemPtr->title.String(), itemPtr->title.CountChars());
Item* itemPtr = (Item*)item;
// Delegate the actual file writing to the item itself.
itemPtr->Filetize( false );
return false;
}
// Program entry point: builds a Channel from the path in argv[1], parses it,
// and runs create_item over the channel's item list.
// NOTE(review): argv[1] is used without checking argc.
// NOTE(review): the construct/parse/iterate sequence appears twice below —
// presumably the previous and current versions interleaved; confirm which
// lines belong in the file.
int
main ( int argc, char** argv )
{
// Directory passed to the two-argument Channel constructor below.
BString outputDir("/boot/home/feeds/");
// NOTE(review): the malloc'd block is replaced by the pointer returned from
// `new` on the next assignment to `chan` and is never freed here.
Channel* chan = (Channel*)malloc( sizeof(Channel) );
chan = new Channel(argv[1]);
chan->Parse();
BList items = chan->items;
printf("%s\n", chan->title.String());
items.DoForEach(&create_item);
chan = new Channel(argv[1], outputDir);
chan->Parse();
BList items = chan->items;
items.DoForEach(&create_item);
return 0;
}

View File

@ -1,3 +1,5 @@
#include <iostream>
#include <sstream>
#include <raptor2/raptor2.h>
#include "Channel.h"
#include "Item.h"
@ -20,7 +22,7 @@ feedParser ( Channel** chanPtr )
unsigned char *uri_string;
raptor_uri *uri, *base_uri;
rss_parser = raptor_new_parser(world, "rss-tag-soup");
rss_parser = raptor_new_parser( world, "rss-tag-soup" );
uri_string = raptor_uri_filename_to_uri_string( chan->filePath.String() );
uri = raptor_new_uri( world, uri_string );
base_uri = raptor_uri_copy( uri );
@ -28,10 +30,10 @@ feedParser ( Channel** chanPtr )
raptor_parser_set_statement_handler( rss_parser, &chan, feedHandler );
raptor_parser_parse_file( rss_parser, uri, base_uri );
raptor_free_parser(rss_parser);
raptor_free_uri(base_uri);
raptor_free_uri(uri);
raptor_free_memory(uri_string);
raptor_free_parser( rss_parser );
raptor_free_uri( base_uri );
raptor_free_uri( uri );
raptor_free_memory( uri_string );
raptor_free_world( world );
}
@ -56,11 +58,11 @@ countItemParser ( const char* filePath )
raptor_parser_set_statement_handler( rss_parser, &itemCount, countItemHandler );
raptor_parser_parse_file( rss_parser, uri, base_uri );
free(itemCount);
raptor_free_parser(rss_parser);
raptor_free_uri(base_uri);
raptor_free_uri(uri);
raptor_free_memory(uri_string);
free( itemCount );
raptor_free_parser( rss_parser );
raptor_free_uri( base_uri );
raptor_free_uri( uri );
raptor_free_memory( uri_string );
raptor_free_world( world );
return *(itemCount);
@ -84,10 +86,10 @@ printStatementParser ( const char* filePath )
raptor_parser_set_statement_handler( rss_parser, NULL, printStatementHandler );
raptor_parser_parse_file( rss_parser, uri, base_uri );
raptor_free_parser(rss_parser);
raptor_free_uri(base_uri);
raptor_free_uri(uri);
raptor_free_memory(uri_string);
raptor_free_parser( rss_parser );
raptor_free_uri( base_uri );
raptor_free_uri( uri );
raptor_free_memory( uri_string );
raptor_free_world( world );
}
@ -127,7 +129,7 @@ printStatementHandler ( void* user_data, raptor_statement* statement )
const char* predicate = ( const char* )raptor_term_to_string( statement->predicate );
const char* object = ( const char* )raptor_term_to_string( statement->object );
printf("%s\t-%s\n%.5s\n", subject, predicate, object);
printf("%s\t-%s\n%.50s\n", subject, predicate, object);
}
// ----------------------------------------------------------------------------
@ -136,9 +138,20 @@ void
handleFeedStatement ( Channel** chanPtr, raptor_statement* statement )
{
Channel* chan = *(chanPtr);
BString predicate = BString(( const char* )raptor_term_to_string( statement->predicate ));
BString subject = BString(( const char* )raptor_term_to_string( statement->subject ));
BString object = BString(( const char* )raptor_term_to_string( statement->object ));
const char* cpredicate = (const char*)raptor_term_to_string( statement->predicate );
const char* csubject = (const char*)raptor_term_to_string( statement->subject );
const char* cobject = (const char*)raptor_term_to_string( statement->object );
BString predicate = BString(cpredicate);
BString subject = BString(csubject);
BString bobject = BString(cobject);
bobject.ReplaceAll("\\\"","\"");
bobject.ReplaceFirst("\"","");
bobject.ReplaceLast("\"","");
std::string object = unescape(bobject.String());
predicate = getPredicateTag( predicate );
if ( predicate == "type" && getPredicateTag( object ) == "channel" )
@ -157,7 +170,7 @@ handleChannelStatement ( Channel** chanPtr, BString predicate, BString object )
}
void
handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, BString object )
handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, std::string object )
{
Channel* chan = *(chanPtr);
if ( subject.StartsWith("_:genid") )
@ -169,7 +182,7 @@ handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, BSt
chan->lastSubject = subject;
Item* newItem = (Item*)malloc( sizeof(Item) );
newItem = new Item( subject );
newItem = new Item( subject, chan->outputDir );
chan->items.AddItem( newItem );
}
@ -177,10 +190,19 @@ handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, BSt
Item* nowItem = (Item*)chan->items.LastItem();
if ( predicate == "title" )
nowItem->title = object;
nowItem->title = BString(object.c_str());
if ( predicate == "encoded" || predicate == "Atomcontent" )
nowItem->content = object;
if ( predicate == "description" )
nowItem->description = BString(object.c_str());
if ( predicate == "link" || predicate == "Atomlink" )
nowItem->postUrl = BString(object.c_str());
if ( predicate == "Atomhref" )
nowItem->postUrl = BString(object.c_str());
if ( predicate == "date" || predicate == "Atompublished" ) // 2019-02-18T01:43:43Z
nowItem->pubDate = BString(object.c_str());
if ( predicate == "pubDate" ) // Sun, 17 Feb 2019 19:43:43 -0600
nowItem->pubDate = BString(object.c_str());
}
@ -197,9 +219,93 @@ getPredicateTag ( BString spec )
return spec;
}
// C-string convenience overload: wraps `spec` in a BString and forwards to the
// BString overload of getPredicateTag.
// NOTE(review): two signature lines appear back to back (char* and
// const char*) — presumably the previous and current versions; confirm which
// one belongs in the file.
BString
getPredicateTag ( char* spec )
getPredicateTag ( const char* spec )
{
return getPredicateTag( BString(spec) );
}
// std::string convenience overload: passes the string's C-string view on to
// the const char* overload of getPredicateTag.
BString
getPredicateTag ( std::string spec )
{
	const char* rawSpec = spec.c_str();
	return getPredicateTag( rawSpec );
}
// ----------------------------------------------------------------------------
/* What ensues is a terrifying violation of the human form.
* Just atrocious. I deserve to be impaled by an ice-pick.
* ... something (unfortunately), directly ripped from StackOverflow.
* So when getting a raptor_statement's object, it's a char array filled
* with escaped characters (\U2901, etc).
* I'm really not sure how to best manage this, so SO.
* Thanks remy-lebeau, I owe you.
* https://stackoverflow.com/questions/28534221 */
/* Encodes a single Unicode code point as a UTF-8 byte sequence.
 *
 * cp: the code point to encode.
 * Returns 1-4 bytes of UTF-8, or an empty string when cp is above 0x10FFFF.
 */
std::string
toUtf8 ( uint32_t cp )
{
	std::string result;

	if (cp <= 0x007F) {
		result += static_cast<char>(cp);
	} else if (cp <= 0x07FF) {
		result += static_cast<char>(0xC0 | (cp >> 6));
		result += static_cast<char>(0x80 | (cp & 0x3F));
	} else if (cp <= 0xFFFF) {
		result += static_cast<char>(0xE0 | (cp >> 12));
		result += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
		result += static_cast<char>(0x80 | (cp & 0x3F));
	} else if (cp <= 0x10FFFF) {
		result += static_cast<char>(0xF0 | (cp >> 18));
		result += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
		result += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
		result += static_cast<char>(0x80 | (cp & 0x3F));
	}
	// Anything larger is not a valid code point: the result stays empty.

	return result;
}

/* Replaces every `escape`-prefixed hexadecimal code point in `str` with its
 * UTF-8 encoding.
 *
 * str:    text containing escapes such as "\u00E9" or "\U0001F600".
 * escape: the literal prefix that introduces an escape ("\\u" or "\\U").
 *
 * At most 4 hex digits are consumed after "\\u" and at most 8 after "\\U", so
 * ordinary text that happens to look like hex ("\u00E9e") is left alone. Any
 * other prefix consumes every hex digit that follows it. A prefix followed by
 * no hex digits, or by a number too large to parse, is left in place.
 * Returns the converted string.
 */
std::string
unescape ( std::string str, std::string escape )
{
	const std::string::size_type prefixLen = escape.length();
	if (prefixLen == 0)
		return str; // an empty prefix would match at every position

	// Digit cap for the two known escape forms; npos means "no cap".
	std::string::size_type maxDigits = std::string::npos;
	if (escape == "\\u")
		maxDigits = 4;
	else if (escape == "\\U")
		maxDigits = 8;

	std::string::size_type pos = 0;
	while ((pos = str.find(escape, pos)) != std::string::npos) {
		const std::string::size_type digitsStart = pos + prefixLen;
		std::string::size_type digitsEnd =
			str.find_first_not_of("0123456789abcdefABCDEF", digitsStart);
		// Digits running up to the end of the string are still a valid escape.
		if (digitsEnd == std::string::npos)
			digitsEnd = str.length();

		std::string::size_type digitCount = digitsEnd - digitsStart;
		if (maxDigits != std::string::npos && digitCount > maxDigits)
			digitCount = maxDigits;

		std::istringstream iss(str.substr(digitsStart, digitCount));
		uint32_t cp;
		if (iss >> std::hex >> cp) {
			const std::string utf8 = toUtf8(cp);
			str.replace(pos, prefixLen + digitCount, utf8);
			pos += utf8.length();
		} else {
			// Not a usable escape: step over the prefix and keep scanning.
			pos += prefixLen;
		}
	}

	return str;
}

/* Converts both "\uXXXX" and "\UXXXXXXXX" escapes in a C string to UTF-8.
 *
 * str: NUL-terminated text to convert.
 * Returns the converted text as a std::string.
 */
std::string
unescape (const char* str )
{
	return unescape(unescape(std::string(str), "\\u"), "\\U");
}

View File

@ -1,6 +1,7 @@
#ifndef PARSE_H
#define PARSE_H
#include <iostream>
#include <raptor2/raptor2.h>
#include "Channel.h"
@ -9,7 +10,7 @@ void feedParser (Channel**);
void feedHandler ( void*, raptor_statement* );
void handleFeedStatement ( Channel**, raptor_statement* );
void handleChannelStatement ( Channel**, BString, BString );
void handleItemStatement ( Channel**, BString, BString, BString );
void handleItemStatement ( Channel**, BString, BString, std::string );
int countItemParser ( const char* );
void countItemHandler ( void*, raptor_statement* );
@ -17,7 +18,11 @@ void countItemHandler ( void*, raptor_statement* );
void printStatementParser ( const char* );
void printStatementHandler ( void*, raptor_statement* );
BString getPredicateTag ( char* );
BString getPredicateTag ( const char* );
BString getPredicateTag ( BString );
BString getPredicateTag ( std::string );
std::string to_utf ( uint32 );
std::string unescape ( std::string, std::string );
std::string unescape ( const char* );
#endif

64
test/xwx.xml Normal file
View File

@ -0,0 +1,64 @@
<?xml version="1.0" ?>
<rss version="2.0">
<channel>
<title>galactic station xwx</title>
<link>http://localhost:8000</link>
<description></description>
<item>
<title>La Haiku Funkcisistemo k Esperanto</title>
<link>http://localhost:8000../lib/haiku-k-esperanto.html</link>
<description></description>
<pubDate>Sat, 9 May 2020 01:27:32 -0600</pubDate>
</item>
<item>
<title>Preter Vim: hoj Kakoune!</title>
<link>http://localhost:8000../lib/preter-vim-al-kak.html</link>
<description></description>
<pubDate>Thu, 02 Jan 2019 00:05:20 -0600</pubDate>
</item>
<item>
<title>New domain - Novjaro k novnom</title>
<link>http://localhost:8000../lib/nova-retejnomo.html</link>
<description></description>
<pubDate>Web, 1 Jan 2020 19:43:43 -0600</pubDate>
</item>
<item>
<title>Trans la Rivera Lavejo</title>
<link>http://localhost:8000../lib/trans-la-rivero.html</link>
<description></description>
<pubDate>Wed, 13 Nov 2019 13:55:23 -0600</pubDate>
</item>
<item>
<title>Arteco kaj Derivfikcio</title>
<link>http://localhost:8000../lib/artec-kaj-fanatikfikci.html</link>
<description></description>
<pubDate>Sat, 2 Nov 2019 00:52:44 -0600</pubDate>
</item>
<item>
<title>Universalismo kaj fikciaj bestaĉoj</title>
<link>http://localhost:8000../lib/universalismo-kaj-ficiaj-bestaĉoj.html</link>
<description></description>
<pubDate>Mon, 15 Jul 2019 22:05:32 -0600</pubDate>
</item>
<item>
<title>Project Diva f with English subs</title>
<link>http://localhost:8000../lib/project-diva-f-better-english.html</link>
<description></description>
<pubDate>Wed, 26 Jun 2019 22:37:56 -0600</pubDate>
</item>
<item>
<title>SBCL k plibonigita terminalo</title>
<link>http://localhost:8000../lib/sbcl-k-plibonigita-terminalo.html</link>
<description></description>
<pubDate>Wed, 19 Jun 2019 20:21:01 -0600</pubDate>
</item>
<item>
<title>Cowsay and Rewarding HTML</title>
<link>http://localhost:8000../lib/cowsay-and-html.html</link>
<description></description>
<pubDate>Sun, 17 Feb 2019 19:43:43 -0600</pubDate>
</item>
</channel>
</rss>