Basic 'filetizing' of feed Channels and Items
This commit is contained in:
parent
00c42a860c
commit
4d4e6bad78
|
@ -4,7 +4,7 @@
|
|||
#include "Item.h"
|
||||
#include "parsing.h"
|
||||
|
||||
Channel::Channel ( BString path )
|
||||
Channel::Channel ( BString path, BString outputPath )
|
||||
{
|
||||
title = BString("Untitled Feed");
|
||||
description = BString("Nondescript, N/A.");
|
||||
|
@ -13,6 +13,7 @@ Channel::Channel ( BString path )
|
|||
filePath = path;
|
||||
topLevelSubject = "";
|
||||
lastSubject = "";
|
||||
outputDir = outputPath;
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -19,9 +19,10 @@ public:
|
|||
BString topLevelSubject;
|
||||
BString lastSubject;
|
||||
BString filePath;
|
||||
BString outputDir;
|
||||
|
||||
|
||||
Channel ( BString );
|
||||
Channel ( BString, BString );
|
||||
// Channel ( BEntry );
|
||||
// Channel ( BUrl );
|
||||
void Parse ( void );
|
||||
|
|
44
src/Item.cpp
44
src/Item.cpp
|
@ -1,13 +1,51 @@
|
|||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <raptor2/raptor2.h>
|
||||
#include <StorageKit.h>
|
||||
#include "Item.h"
|
||||
|
||||
/* Construct an empty feed item.
 *
 * localSubject is stored as the item's subject, and outputPath is stored as
 * outputDir (the directory Filetize() writes into). Every other text field
 * starts out as an empty string. */
Item::Item ( BString localSubject, BString outputPath )
	: title( "" ),
	  description( "" ),
	  pubDate( "" ),
	  homePage( "" ),
	  postUrl( "" ),
	  content( "" ),
	  outputDir( outputPath ),
	  subject( localSubject )
{
}
|
||||
|
||||
bool
|
||||
Item::Filetize ( bool onlyIfNew = false )
|
||||
{
|
||||
BDirectory* dir = new BDirectory( outputDir );
|
||||
BFile* file = new BFile( title.String(), B_READ_WRITE );
|
||||
|
||||
dir->CreateFile( title.String(), file );
|
||||
|
||||
BString betype = "text/html";
|
||||
|
||||
file->WriteAttr( "META:title", B_STRING_TYPE, 0,
|
||||
title.String(), title.CountChars() );
|
||||
file->WriteAttr( "description", B_STRING_TYPE, 0,
|
||||
description.String(), description.CountChars() );
|
||||
file->WriteAttr( "pubDate", B_STRING_TYPE, 0,
|
||||
pubDate.String(), pubDate.CountChars() );
|
||||
file->WriteAttr( "META:url", B_STRING_TYPE, 0,
|
||||
postUrl.String(), postUrl.CountChars() );
|
||||
file->WriteAttr( "BEOS:TYPE", B_STRING_TYPE, 0,
|
||||
betype.String(), betype.CountChars() );
|
||||
|
||||
// using file->Write with content converted to C string messes up length ofc
|
||||
// this is required to preserve length (because of UTF char substitutions in parsing.cpp)
|
||||
const char* strPath = outputDir.String();
|
||||
std::string path(strPath);
|
||||
path += std::string(title.String());
|
||||
std::cout << path << std::endl;
|
||||
|
||||
std::ofstream pFile(path);
|
||||
pFile << content;
|
||||
pFile.close();
|
||||
return false;
|
||||
}
|
||||
|
|
11
src/Item.h
11
src/Item.h
|
@ -1,6 +1,7 @@
|
|||
#ifndef ITEM_H
|
||||
#define ITEM_H
|
||||
|
||||
#include <iostream>
|
||||
#include <DateTime.h>
|
||||
#include <String.h>
|
||||
#include <List.h>
|
||||
|
@ -10,15 +11,17 @@ class Item {
|
|||
public:
|
||||
BString title;
|
||||
BString description;
|
||||
BDate pubDate;
|
||||
BString pubDate;
|
||||
BString homePage;
|
||||
BString postUrl;
|
||||
BString content;
|
||||
std::string content;
|
||||
BString outputDir;
|
||||
|
||||
BString subject;
|
||||
|
||||
void Print ( void );
|
||||
Item ( BString );
|
||||
Item ( BString, BString );
|
||||
|
||||
bool Filetize ( bool );
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -2,40 +2,27 @@
|
|||
#include <StorageKit.h>
|
||||
#include "Channel.h"
|
||||
#include "Item.h"
|
||||
#include "parsing.h"
|
||||
#include "parsing.h" //
|
||||
|
||||
|
||||
bool
|
||||
create_item ( void* item )
|
||||
{
|
||||
Item* itemPtr = (Item*)item;
|
||||
|
||||
BDirectory* dir = new BDirectory("./test/test/");
|
||||
BFile* file = new BFile(itemPtr->title.String(), B_READ_WRITE);
|
||||
|
||||
dir->CreateFile(itemPtr->title.String(), file);
|
||||
|
||||
file->WriteAttr("title",B_STRING_TYPE,0,
|
||||
itemPtr->title.String(),itemPtr->title.CountChars());
|
||||
file->WriteAttr("description",B_STRING_TYPE,0,
|
||||
itemPtr->description.String(),itemPtr->description.CountChars());
|
||||
|
||||
// const char* buf;
|
||||
// buf = itemPtr->title.String();
|
||||
file->Write(itemPtr->title.String(), itemPtr->title.CountChars());
|
||||
|
||||
Item* itemPtr = (Item*)item;
|
||||
itemPtr->Filetize( false );
|
||||
return false;
|
||||
}
|
||||
|
||||
int
|
||||
main ( int argc, char** argv )
|
||||
{
|
||||
BString outputDir("/boot/home/feeds/");
|
||||
Channel* chan = (Channel*)malloc( sizeof(Channel) );
|
||||
chan = new Channel(argv[1]);
|
||||
|
||||
chan = new Channel(argv[1], outputDir);
|
||||
chan->Parse();
|
||||
|
||||
BList items = chan->items;
|
||||
printf("%s\n", chan->title.String());
|
||||
items.DoForEach(&create_item);
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
154
src/parsing.cpp
154
src/parsing.cpp
|
@ -1,3 +1,5 @@
|
|||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <raptor2/raptor2.h>
|
||||
#include "Channel.h"
|
||||
#include "Item.h"
|
||||
|
@ -20,7 +22,7 @@ feedParser ( Channel** chanPtr )
|
|||
unsigned char *uri_string;
|
||||
raptor_uri *uri, *base_uri;
|
||||
|
||||
rss_parser = raptor_new_parser(world, "rss-tag-soup");
|
||||
rss_parser = raptor_new_parser( world, "rss-tag-soup" );
|
||||
uri_string = raptor_uri_filename_to_uri_string( chan->filePath.String() );
|
||||
uri = raptor_new_uri( world, uri_string );
|
||||
base_uri = raptor_uri_copy( uri );
|
||||
|
@ -28,10 +30,10 @@ feedParser ( Channel** chanPtr )
|
|||
raptor_parser_set_statement_handler( rss_parser, &chan, feedHandler );
|
||||
raptor_parser_parse_file( rss_parser, uri, base_uri );
|
||||
|
||||
raptor_free_parser(rss_parser);
|
||||
raptor_free_uri(base_uri);
|
||||
raptor_free_uri(uri);
|
||||
raptor_free_memory(uri_string);
|
||||
raptor_free_parser( rss_parser );
|
||||
raptor_free_uri( base_uri );
|
||||
raptor_free_uri( uri );
|
||||
raptor_free_memory( uri_string );
|
||||
raptor_free_world( world );
|
||||
}
|
||||
|
||||
|
@ -56,11 +58,11 @@ countItemParser ( const char* filePath )
|
|||
raptor_parser_set_statement_handler( rss_parser, &itemCount, countItemHandler );
|
||||
raptor_parser_parse_file( rss_parser, uri, base_uri );
|
||||
|
||||
free(itemCount);
|
||||
raptor_free_parser(rss_parser);
|
||||
raptor_free_uri(base_uri);
|
||||
raptor_free_uri(uri);
|
||||
raptor_free_memory(uri_string);
|
||||
free( itemCount );
|
||||
raptor_free_parser( rss_parser );
|
||||
raptor_free_uri( base_uri );
|
||||
raptor_free_uri( uri );
|
||||
raptor_free_memory( uri_string );
|
||||
raptor_free_world( world );
|
||||
|
||||
return *(itemCount);
|
||||
|
@ -84,10 +86,10 @@ printStatementParser ( const char* filePath )
|
|||
raptor_parser_set_statement_handler( rss_parser, NULL, printStatementHandler );
|
||||
raptor_parser_parse_file( rss_parser, uri, base_uri );
|
||||
|
||||
raptor_free_parser(rss_parser);
|
||||
raptor_free_uri(base_uri);
|
||||
raptor_free_uri(uri);
|
||||
raptor_free_memory(uri_string);
|
||||
raptor_free_parser( rss_parser );
|
||||
raptor_free_uri( base_uri );
|
||||
raptor_free_uri( uri );
|
||||
raptor_free_memory( uri_string );
|
||||
raptor_free_world( world );
|
||||
}
|
||||
|
||||
|
@ -127,7 +129,7 @@ printStatementHandler ( void* user_data, raptor_statement* statement )
|
|||
const char* predicate = ( const char* )raptor_term_to_string( statement->predicate );
|
||||
const char* object = ( const char* )raptor_term_to_string( statement->object );
|
||||
|
||||
printf("%s\t-%s\n%.5s\n", subject, predicate, object);
|
||||
printf("%s\t-%s\n%.50s\n", subject, predicate, object);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
@ -136,9 +138,20 @@ void
|
|||
handleFeedStatement ( Channel** chanPtr, raptor_statement* statement )
|
||||
{
|
||||
Channel* chan = *(chanPtr);
|
||||
BString predicate = BString(( const char* )raptor_term_to_string( statement->predicate ));
|
||||
BString subject = BString(( const char* )raptor_term_to_string( statement->subject ));
|
||||
BString object = BString(( const char* )raptor_term_to_string( statement->object ));
|
||||
const char* cpredicate = (const char*)raptor_term_to_string( statement->predicate );
|
||||
const char* csubject = (const char*)raptor_term_to_string( statement->subject );
|
||||
const char* cobject = (const char*)raptor_term_to_string( statement->object );
|
||||
|
||||
BString predicate = BString(cpredicate);
|
||||
BString subject = BString(csubject);
|
||||
BString bobject = BString(cobject);
|
||||
|
||||
bobject.ReplaceAll("\\\"","\"");
|
||||
bobject.ReplaceFirst("\"","");
|
||||
bobject.ReplaceLast("\"","");
|
||||
|
||||
std::string object = unescape(bobject.String());
|
||||
|
||||
predicate = getPredicateTag( predicate );
|
||||
|
||||
if ( predicate == "type" && getPredicateTag( object ) == "channel" )
|
||||
|
@ -157,7 +170,7 @@ handleChannelStatement ( Channel** chanPtr, BString predicate, BString object )
|
|||
}
|
||||
|
||||
void
|
||||
handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, BString object )
|
||||
handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, std::string object )
|
||||
{
|
||||
Channel* chan = *(chanPtr);
|
||||
if ( subject.StartsWith("_:genid") )
|
||||
|
@ -169,7 +182,7 @@ handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, BSt
|
|||
chan->lastSubject = subject;
|
||||
|
||||
Item* newItem = (Item*)malloc( sizeof(Item) );
|
||||
newItem = new Item( subject );
|
||||
newItem = new Item( subject, chan->outputDir );
|
||||
|
||||
chan->items.AddItem( newItem );
|
||||
}
|
||||
|
@ -177,10 +190,19 @@ handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, BSt
|
|||
Item* nowItem = (Item*)chan->items.LastItem();
|
||||
|
||||
if ( predicate == "title" )
|
||||
nowItem->title = object;
|
||||
|
||||
nowItem->title = BString(object.c_str());
|
||||
if ( predicate == "encoded" || predicate == "Atomcontent" )
|
||||
nowItem->content = object;
|
||||
if ( predicate == "description" )
|
||||
nowItem->description = BString(object.c_str());
|
||||
if ( predicate == "link" || predicate == "Atomlink" )
|
||||
nowItem->postUrl = BString(object.c_str());
|
||||
if ( predicate == "Atomhref" )
|
||||
nowItem->postUrl = BString(object.c_str());
|
||||
if ( predicate == "date" || predicate == "Atompublished" ) // 2019-02-18T01:43:43Z
|
||||
nowItem->pubDate = BString(object.c_str());
|
||||
if ( predicate == "pubDate" ) // Sun, 17 Feb 2019 19:43:43 -0600
|
||||
nowItem->pubDate = BString(object.c_str());
|
||||
}
|
||||
|
||||
|
||||
|
@ -197,9 +219,93 @@ getPredicateTag ( BString spec )
|
|||
|
||||
return spec;
|
||||
}
|
||||
|
||||
// Convenience overload: wraps a C string in a BString and defers to the
// BString version of getPredicateTag.
BString
getPredicateTag ( const char* spec )
{
	return getPredicateTag( BString(spec) );
}
|
||||
// Convenience overload: hands the std::string's character data to the
// C-string version of getPredicateTag.
BString
getPredicateTag ( std::string spec )
{
	const char* rawSpec = spec.c_str();
	return getPredicateTag( rawSpec );
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/* What ensues is a terrifying violation of the human form.
|
||||
* Just atrocious. I deserve to be impaled by an ice-pick.
|
||||
* ... something (unfortunately), directly ripped from StackOverflow.
|
||||
* So when getting a raptor_statement's object, it's a char array filled
|
||||
* with escaped characters (\U2901, etc).
|
||||
* I'm really not sure how to best manage this, so SO.
|
||||
* Thanks remy-lebeau, I owe you.
|
||||
* https://stackoverflow.com/questions/28534221 */
|
||||
/* Encode one Unicode code point as UTF-8.
 *
 * Returns the 1-4 byte encoding of cp, or an empty string when cp is not a
 * Unicode scalar value (above U+10FFFF, or a surrogate U+D800..U+DFFF, which
 * UTF-8 must never contain). */
std::string
toUtf8 ( uint32_t cp )
{
	std::string result;

	if ( cp > 0x10FFFF || ( cp >= 0xD800 && cp <= 0xDFFF ) )
		return result;

	if ( cp <= 0x007F ) {
		result += static_cast<char>( cp );
	} else if ( cp <= 0x07FF ) {
		result += static_cast<char>( 0xC0 | ( cp >> 6 ) );
		result += static_cast<char>( 0x80 | ( cp & 0x3F ) );
	} else if ( cp <= 0xFFFF ) {
		result += static_cast<char>( 0xE0 | ( cp >> 12 ) );
		result += static_cast<char>( 0x80 | ( ( cp >> 6 ) & 0x3F ) );
		result += static_cast<char>( 0x80 | ( cp & 0x3F ) );
	} else {
		result += static_cast<char>( 0xF0 | ( cp >> 18 ) );
		result += static_cast<char>( 0x80 | ( ( cp >> 12 ) & 0x3F ) );
		result += static_cast<char>( 0x80 | ( ( cp >> 6 ) & 0x3F ) );
		result += static_cast<char>( 0x80 | ( cp & 0x3F ) );
	}

	return result;
}

/* Replace every `escape` + hex-digits sequence in str with the UTF-8
 * encoding of the code point the digits spell.
 *
 * escape is the marker that introduces a sequence, e.g. "\\u" or "\\U".
 * At most 8 hex digits are consumed when the marker ends in 'U' and at most
 * 4 otherwise, so ordinary text that merely looks like hex right after an
 * escape is left alone. A marker with no hex digits after it is kept as-is;
 * a sequence naming an invalid code point is removed.
 *
 * Returns the rewritten copy of str. */
std::string
unescape ( std::string str, std::string escape )
{
	// An empty marker would match at every position and never advance.
	if ( escape.empty() )
		return str;

	static const char* const kHexDigits = "0123456789abcdefABCDEF";
	const std::string::size_type markerLen = escape.length();
	const std::string::size_type maxDigits =
		( escape[markerLen - 1] == 'U' ) ? 8 : 4;

	std::string::size_type startIdx = 0;
	while ( ( startIdx = str.find( escape, startIdx ) ) != std::string::npos ) {
		const std::string::size_type digitsIdx = startIdx + markerLen;

		// The digit run may extend to the very end of the string.
		std::string::size_type endIdx = str.find_first_not_of( kHexDigits, digitsIdx );
		if ( endIdx == std::string::npos )
			endIdx = str.length();
		if ( endIdx - digitsIdx > maxDigits )
			endIdx = digitsIdx + maxDigits;

		const std::string digits = str.substr( digitsIdx, endIdx - digitsIdx );
		std::istringstream iss( digits );

		uint32_t cp;
		if ( iss >> std::hex >> cp ) {
			const std::string utf8 = toUtf8( cp );
			str.replace( startIdx, markerLen + digits.length(), utf8 );
			startIdx += utf8.length();
		} else {
			// Not a usable escape: step over the marker and keep scanning.
			startIdx += markerLen;
		}
	}

	return str;
}

/* Decode both the 4-digit ("\uXXXX") and 8-digit ("\UXXXXXXXX") escape forms
 * found in str and return the result; a null pointer yields an empty string. */
std::string
unescape (const char* str )
{
	if ( str == nullptr )
		return std::string();

	return unescape( unescape( std::string( str ), "\\u" ), "\\U" );
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#ifndef PARSE_H
|
||||
#define PARSE_H
|
||||
|
||||
#include <iostream>
|
||||
#include <raptor2/raptor2.h>
|
||||
#include "Channel.h"
|
||||
|
||||
|
@ -9,7 +10,7 @@ void feedParser (Channel**);
|
|||
void feedHandler ( void*, raptor_statement* );
|
||||
void handleFeedStatement ( Channel**, raptor_statement* );
|
||||
void handleChannelStatement ( Channel**, BString, BString );
|
||||
void handleItemStatement ( Channel**, BString, BString, BString );
|
||||
void handleItemStatement ( Channel**, BString, BString, std::string );
|
||||
|
||||
int countItemParser ( const char* );
|
||||
void countItemHandler ( void*, raptor_statement* );
|
||||
|
@ -17,7 +18,11 @@ void countItemHandler ( void*, raptor_statement* );
|
|||
void printStatementParser ( const char* );
|
||||
void printStatementHandler ( void*, raptor_statement* );
|
||||
|
||||
BString getPredicateTag ( char* );
|
||||
BString getPredicateTag ( const char* );
|
||||
BString getPredicateTag ( BString );
|
||||
BString getPredicateTag ( std::string );
|
||||
// Declaration kept in sync with the toUtf8 definition in parsing.cpp.
std::string toUtf8 ( uint32_t );
|
||||
std::string unescape ( std::string, std::string );
|
||||
std::string unescape ( const char* );
|
||||
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
<?xml version="1.0" ?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<title>galactic station xwx</title>
|
||||
<link>http://localhost:8000</link>
|
||||
<description></description>
|
||||
|
||||
<item>
|
||||
<title>La Haiku Funkcisistemo k Esperanto</title>
|
||||
<link>http://localhost:8000../lib/haiku-k-esperanto.html</link>
|
||||
<description></description>
|
||||
<pubDate>Sat, 9 May 2020 01:27:32 -0600</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title>Preter Vim: hoj Kakoune!</title>
|
||||
<link>http://localhost:8000../lib/preter-vim-al-kak.html</link>
|
||||
<description></description>
|
||||
<pubDate>Thu, 02 Jan 2019 00:05:20 -0600</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title>New domain - Novjaro k novnom</title>
|
||||
<link>http://localhost:8000../lib/nova-retejnomo.html</link>
|
||||
<description></description>
|
||||
<pubDate>Wed, 1 Jan 2020 19:43:43 -0600</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title>Trans la Rivera Lavejo</title>
|
||||
<link>http://localhost:8000../lib/trans-la-rivero.html</link>
|
||||
<description></description>
|
||||
<pubDate>Wed, 13 Nov 2019 13:55:23 -0600</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title>Arteco kaj Derivfikcio</title>
|
||||
<link>http://localhost:8000../lib/artec-kaj-fanatikfikci.html</link>
|
||||
<description></description>
|
||||
<pubDate>Sat, 2 Nov 2019 00:52:44 -0600</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title>Universalismo kaj fikciaj bestaĉoj</title>
|
||||
<link>http://localhost:8000../lib/universalismo-kaj-ficiaj-bestaĉoj.html</link>
|
||||
<description></description>
|
||||
<pubDate>Mon, 15 Jul 2019 22:05:32 -0600</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title>Project Diva f with English subs</title>
|
||||
<link>http://localhost:8000../lib/project-diva-f-better-english.html</link>
|
||||
<description></description>
|
||||
<pubDate>Wed, 26 Jun 2019 22:37:56 -0600</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title>SBCL k plibonigita terminalo</title>
|
||||
<link>http://localhost:8000../lib/sbcl-k-plibonigita-terminalo.html</link>
|
||||
<description></description>
|
||||
<pubDate>Wed, 19 Jun 2019 20:21:01 -0600</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title>Cowsay and Rewarding HTML</title>
|
||||
<link>http://localhost:8000../lib/cowsay-and-html.html</link>
|
||||
<description></description>
|
||||
<pubDate>Sun, 17 Feb 2019 19:43:43 -0600</pubDate>
|
||||
</item>
|
||||
|
||||
</channel>
|
||||
</rss>
|
Ŝarĝante…
Reference in New Issue