Basic 'filetizing' of feed Channels and Items

2020-06-20 21:15:34 -05:00 · 2020-06-20 21:15:34 -05:00 · 4d4e6bad78
parent 00c42a860c
commit 4d4e6bad78
8 changed files with 263 additions and 58 deletions
--- a/src/Channel.cpp
+++ b/src/Channel.cpp
@ -4,7 +4,7 @@
 #include "Item.h"
 #include "parsing.h"

-Channel::Channel ( BString path )
+Channel::Channel ( BString path, BString outputPath )
 {
 	title = BString("Untitled Feed");
 	description = BString("Nondescript, N/A.");
@ -13,6 +13,7 @@ Channel::Channel ( BString path )
 	filePath = path;
 	topLevelSubject = "";
 	lastSubject = "";
+	outputDir = outputPath;
 }

 void
--- a/src/Channel.h
+++ b/src/Channel.h
@ -19,9 +19,10 @@ public:
 	BString topLevelSubject;
 	BString lastSubject;
 	BString	filePath;
+	BString outputDir;


-	Channel	( BString );
+	Channel	( BString, BString );
 //	Channel	( BEntry );
 //	Channel	( BUrl );
 	void	Parse	( void );
--- a/src/Item.cpp
+++ b/src/Item.cpp
@ -1,13 +1,51 @@
-#include <cstdio>
+#include <iostream>
+#include <fstream>
 #include <raptor2/raptor2.h>
+#include <StorageKit.h>
 #include "Item.h"

-Item::Item ( BString localSubject )
+Item::Item ( BString localSubject, BString outputPath )
 {
 	subject	= localSubject;
 	title = BString("");
 	description = BString("");
 	homePage = BString("");
 	postUrl  = BString("");
-	content  = BString("");
+	content  = "";
+	pubDate  = BString("");
+	outputDir = outputPath;
+}
+
+bool
+Item::Filetize ( bool onlyIfNew = false )
+{
+	BDirectory* dir = new BDirectory( outputDir );
+	BFile* file = new BFile( title.String(), B_READ_WRITE );
+
+	dir->CreateFile( title.String(), file );
+
+	BString betype = "text/html";
+
+	file->WriteAttr( "META:title", B_STRING_TYPE, 0,
+			 title.String(), title.CountChars() );
+	file->WriteAttr( "description", B_STRING_TYPE, 0,
+			 description.String(), description.CountChars() );
+	file->WriteAttr( "pubDate", B_STRING_TYPE, 0,
+			 pubDate.String(), pubDate.CountChars() );
+	file->WriteAttr( "META:url", B_STRING_TYPE, 0,
+			 postUrl.String(), postUrl.CountChars() );
+	file->WriteAttr( "BEOS:TYPE", B_STRING_TYPE, 0,
+			 betype.String(), betype.CountChars() );
+
+	// writing content via file->Write after converting it to a C string messes up the length,
+	// so std::ofstream is used to preserve length (because of UTF char substitutions in parsing.cpp)
+	const char* strPath = outputDir.String();
+	std::string path(strPath);
+	path += std::string(title.String());
+	std::cout << path << std::endl;
+
+	std::ofstream pFile(path);
+	pFile << content;
+	pFile.close();
+	return false;
 }
--- a/src/Item.h
+++ b/src/Item.h
@ -1,6 +1,7 @@
 #ifndef ITEM_H
 #define ITEM_H

+#include <iostream>
 #include <DateTime.h>
 #include <String.h>
 #include <List.h>
@ -10,15 +11,17 @@ class Item {
 public:
 	BString  title;
 	BString  description;
-	BDate    pubDate;
+	BString  pubDate;
 	BString  homePage;
 	BString  postUrl;
-	BString  content;
+	std::string  content;
+	BString outputDir;
 	
 	BString  subject;

-	void	Print	( void );
-	Item	( BString );
+	Item	( BString, BString );
+
+	bool	Filetize ( bool );
 };

 #endif
--- a/src/Rifen.cpp
+++ b/src/Rifen.cpp
@ -2,40 +2,27 @@
 #include <StorageKit.h>
 #include "Channel.h"
 #include "Item.h"
-#include "parsing.h"
+#include "parsing.h" //
+

 bool
 create_item ( void* item )
 {
-	Item* itemPtr = (Item*)item;
-
-	BDirectory* dir    = new BDirectory("./test/test/");
-	BFile*      file   = new BFile(itemPtr->title.String(), B_READ_WRITE);
-
-	dir->CreateFile(itemPtr->title.String(), file);
-
-	file->WriteAttr("title",B_STRING_TYPE,0,
-			itemPtr->title.String(),itemPtr->title.CountChars());
-	file->WriteAttr("description",B_STRING_TYPE,0,
-			itemPtr->description.String(),itemPtr->description.CountChars());
-
-//	const char* buf;
-//	buf = itemPtr->title.String();
-	file->Write(itemPtr->title.String(), itemPtr->title.CountChars());
-
+	Item* itemPtr  = (Item*)item;
+	itemPtr->Filetize( false );
 	return false;
 }

 int
 main ( int argc, char** argv )
 {
+	BString outputDir("/boot/home/feeds/");
 	Channel* chan = (Channel*)malloc( sizeof(Channel) );
-	chan = new Channel(argv[1]);
-	chan->Parse();
-	BList items = chan->items;
-	printf("%s\n", chan->title.String());
-	items.DoForEach(&create_item);

-	
+	chan = new Channel(argv[1], outputDir);
+	chan->Parse();
+
+	BList items = chan->items;
+	items.DoForEach(&create_item);
 	return 0;
 }
--- a/src/parsing.cpp
+++ b/src/parsing.cpp
@ -1,3 +1,5 @@
+#include <iostream>
+#include <sstream>
 #include <raptor2/raptor2.h>
 #include "Channel.h"
 #include "Item.h"
@ -20,7 +22,7 @@ feedParser ( Channel** chanPtr )
 	unsigned char *uri_string;
 	raptor_uri *uri, *base_uri;

-	rss_parser = raptor_new_parser(world, "rss-tag-soup");
+	rss_parser = raptor_new_parser( world, "rss-tag-soup" );
 	uri_string = raptor_uri_filename_to_uri_string( chan->filePath.String() );
 	uri = raptor_new_uri( world, uri_string );
 	base_uri = raptor_uri_copy( uri );
@ -28,10 +30,10 @@ feedParser ( Channel** chanPtr )
 	raptor_parser_set_statement_handler( rss_parser, &chan, feedHandler );
 	raptor_parser_parse_file( rss_parser, uri, base_uri );

-	raptor_free_parser(rss_parser);
-	raptor_free_uri(base_uri);
-	raptor_free_uri(uri);
-	raptor_free_memory(uri_string);	
+	raptor_free_parser( rss_parser );
+	raptor_free_uri( base_uri );
+	raptor_free_uri( uri );
+	raptor_free_memory( uri_string );	
 	raptor_free_world( world );
 }

@ -56,11 +58,11 @@ countItemParser ( const char* filePath )
 	raptor_parser_set_statement_handler( rss_parser, &itemCount, countItemHandler );
 	raptor_parser_parse_file( rss_parser, uri, base_uri );

-	free(itemCount);
-	raptor_free_parser(rss_parser);
-	raptor_free_uri(base_uri);
-	raptor_free_uri(uri);
-	raptor_free_memory(uri_string);	
+	free( itemCount );
+	raptor_free_parser( rss_parser );
+	raptor_free_uri( base_uri );
+	raptor_free_uri( uri );
+	raptor_free_memory( uri_string );	
 	raptor_free_world( world );

 	return *(itemCount);
@ -84,10 +86,10 @@ printStatementParser ( const char* filePath )
 	raptor_parser_set_statement_handler( rss_parser, NULL, printStatementHandler );
 	raptor_parser_parse_file( rss_parser, uri, base_uri );

-	raptor_free_parser(rss_parser);
-	raptor_free_uri(base_uri);
-	raptor_free_uri(uri);
-	raptor_free_memory(uri_string);	
+	raptor_free_parser( rss_parser );
+	raptor_free_uri( base_uri );
+	raptor_free_uri( uri );
+	raptor_free_memory( uri_string );	
 	raptor_free_world( world );
 }

@ -127,7 +129,7 @@ printStatementHandler ( void* user_data, raptor_statement* statement )
 	const char* predicate = ( const char* )raptor_term_to_string( statement->predicate );
 	const char* object    = ( const char* )raptor_term_to_string( statement->object );

-	printf("%s\t-%s\n%.5s\n", subject, predicate, object);
+	printf("%s\t-%s\n%.50s\n", subject, predicate, object);
 }

 // ----------------------------------------------------------------------------
@ -136,9 +138,20 @@ void
 handleFeedStatement ( Channel** chanPtr, raptor_statement* statement )
 {
 	Channel* chan = *(chanPtr);
-	BString predicate = BString(( const char* )raptor_term_to_string( statement->predicate ));
-	BString subject   = BString(( const char* )raptor_term_to_string( statement->subject ));
-	BString object    = BString(( const char* )raptor_term_to_string( statement->object ));
+	const char* cpredicate = (const char*)raptor_term_to_string( statement->predicate );
+	const char* csubject   = (const char*)raptor_term_to_string( statement->subject );
+	const char* cobject    = (const char*)raptor_term_to_string( statement->object );
+	
+	BString predicate = BString(cpredicate);
+	BString subject   = BString(csubject);
+	BString bobject    = BString(cobject);
+
+	bobject.ReplaceAll("\\\"","\"");
+	bobject.ReplaceFirst("\"","");
+	bobject.ReplaceLast("\"","");
+
+	std::string object  = unescape(bobject.String());
+
 	predicate = getPredicateTag( predicate );

 	if ( predicate == "type" && getPredicateTag( object ) == "channel" )
@ -157,7 +170,7 @@ handleChannelStatement ( Channel** chanPtr, BString predicate, BString object )
 }

 void
-handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, BString object )
+handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, std::string object )
 {
 	Channel* chan = *(chanPtr);
 	if ( subject.StartsWith("_:genid") )
@ -169,7 +182,7 @@ handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, BSt
 		chan->lastSubject = subject;

 		Item* newItem = (Item*)malloc( sizeof(Item) );
-		newItem = new Item( subject );
+		newItem = new Item( subject, chan->outputDir );

 		chan->items.AddItem( newItem );
 	}
@ -177,10 +190,19 @@ handleItemStatement ( Channel** chanPtr, BString subject, BString predicate, BSt
 	Item* nowItem = (Item*)chan->items.LastItem();
 	
 	if ( predicate == "title" ) 
-		nowItem->title = object;
-
+		nowItem->title = BString(object.c_str());
 	if ( predicate == "encoded" || predicate == "Atomcontent" ) 
 		nowItem->content = object;
+	if ( predicate == "description" )
+		nowItem->description = BString(object.c_str());
+	if ( predicate == "link" || predicate == "Atomlink" )
+		nowItem->postUrl = BString(object.c_str());
+	if ( predicate == "Atomhref" )
+		nowItem->postUrl = BString(object.c_str());
+	if ( predicate == "date" || predicate == "Atompublished" ) // 2019-02-18T01:43:43Z
+		nowItem->pubDate = BString(object.c_str());
+	if ( predicate == "pubDate" )  // Sun, 17 Feb 2019 19:43:43 -0600
+		nowItem->pubDate = BString(object.c_str());
 }


@ -197,9 +219,93 @@ getPredicateTag ( BString spec )

 	return spec;
 }
-
 BString
-getPredicateTag ( char* spec )
+getPredicateTag ( const char* spec )
 {
 	return getPredicateTag( BString(spec) );
 }
+BString
+getPredicateTag ( std::string spec )
+{
+	return getPredicateTag( spec.c_str() );
+}
+
+// ----------------------------------------------------------------------------
+
+/* What ensues is a terrifying violation of the human form.
+ * Just atrocious. I deserve to be impaled by an ice-pick.
+ * ... something (unfortunately) directly ripped from StackOverflow.
+ * So when getting a raptor_statement's object, it's a char array filled
+ * with escaped characters (\U2901, etc).
+ * I'm really not sure how to best manage this, so SO.
+ * Thanks remy-lebeau, I owe you.
+ * https://stackoverflow.com/questions/28534221 */
+std::string
+toUtf8 ( uint32_t cp )
+{
+    std::string result;
+
+    int count;
+    if (cp <= 0x007F)
+        count = 1;
+    else if (cp <= 0x07FF)
+        count = 2;
+    else if (cp <= 0xFFFF)
+        count = 3;
+    else if (cp <= 0x10FFFF)
+        count = 4;
+    else
+        return result; // or throw an exception
+
+    result.resize(count);
+
+    if (count > 1) {
+        for (int i = count-1; i > 0; --i) {
+            result[i] = (char) (0x80 | (cp & 0x3F));
+            cp >>= 6;
+        }
+
+        for (int i = 0; i < count; ++i)
+            cp |= (1 << (7-i));
+    }
+
+    result[0] = (char) cp;
+    return result;
+}
+
+std::string
+unescape ( std::string str, std::string escape )
+{
+	std::string::size_type startIdx = 0;
+	do
+	{
+		startIdx = str.find(escape, startIdx);
+		if (startIdx == std::string::npos) break;
+
+		std::string::size_type endIdx = str.find_first_not_of("0123456789abcdefABCDEF",
+								      startIdx+2);
+		if (endIdx == std::string::npos) break;
+	
+		std::string tmpStr = str.substr(startIdx+2, endIdx-(startIdx+2));
+		std::istringstream iss(tmpStr);
+
+		uint32_t cp;
+		if (iss >> std::hex >> cp)
+		{
+		        std::string utf8 = toUtf8(cp);
+		        str.replace(startIdx, 2+tmpStr.length(), utf8);
+		        startIdx += utf8.length();
+		}
+		else
+			startIdx += 2;
+	}
+	while (true);	
+
+	return str;
+}
+
+std::string
+unescape (const char* str )
+{
+	return unescape(std::string( unescape(std::string(str), "\\u") ), "\\U");
+}
--- a/src/parsing.h
+++ b/src/parsing.h
@ -1,6 +1,7 @@
 #ifndef PARSE_H
 #define PARSE_H

+#include <iostream>
 #include <raptor2/raptor2.h>
 #include "Channel.h"

@ -9,7 +10,7 @@ void	feedParser		(Channel**);
 void	feedHandler		( void*, raptor_statement* );
 void	handleFeedStatement	( Channel**, raptor_statement* );
 void	handleChannelStatement	( Channel**, BString, BString );
-void	handleItemStatement	( Channel**, BString, BString, BString );
+void	handleItemStatement	( Channel**, BString, BString, std::string );

 int	countItemParser		( const char* );
 void	countItemHandler	( void*, raptor_statement* );
@ -17,7 +18,11 @@ void	countItemHandler	( void*, raptor_statement* );
 void	printStatementParser	( const char* );
 void	printStatementHandler	( void*, raptor_statement* );

-BString	getPredicateTag		( char* );
+BString	getPredicateTag		( const char* );
 BString	getPredicateTag		( BString );
+BString	getPredicateTag		( std::string );
+std::string	to_utf		( uint32 );
+std::string	unescape	( std::string, std::string );
+std::string	unescape	( const char* );

 #endif
--- a/test/xwx.xml
+++ b/test/xwx.xml
@ -0,0 +1,64 @@
+<?xml version="1.0" ?>
+<rss version="2.0">
+<channel>
+  <title>galactic station xwx</title>
+  <link>http://localhost:8000</link>
+  <description></description>
+
+	<item>
+	<title>La Haiku Funkcisistemo k Esperanto</title>
+  <link>http://localhost:8000../lib/haiku-k-esperanto.html</link>
+	<description></description>
+	<pubDate>Sat, 9 May 2020 01:27:32 -0600</pubDate>
+</item>
+<item>
+	<title>Preter Vim: hoj Kakoune!</title>
+  <link>http://localhost:8000../lib/preter-vim-al-kak.html</link>
+	<description></description>
+	<pubDate>Thu, 02 Jan 2019 00:05:20 -0600</pubDate>
+</item>
+<item>
+	<title>New domain - Novjaro k novnom</title>
+  <link>http://localhost:8000../lib/nova-retejnomo.html</link>
+	<description></description>
+	<pubDate>Web, 1 Jan 2020 19:43:43 -0600</pubDate>
+</item>
+<item>
+	<title>Trans la Rivera Lavejo</title>
+  <link>http://localhost:8000../lib/trans-la-rivero.html</link>
+	<description></description>
+	<pubDate>Wed, 13 Nov 2019 13:55:23 -0600</pubDate>
+</item>
+<item>
+	<title>Arteco kaj Derivfikcio</title>
+  <link>http://localhost:8000../lib/artec-kaj-fanatikfikci.html</link>
+	<description></description>
+	<pubDate>Sat, 2 Nov 2019 00:52:44 -0600</pubDate>
+</item>
+<item>
+	<title>Universalismo kaj fikciaj bestaĉoj</title>
+  <link>http://localhost:8000../lib/universalismo-kaj-ficiaj-bestaĉoj.html</link>
+	<description></description>
+	<pubDate>Mon, 15 Jul 2019 22:05:32 -0600</pubDate>
+</item>
+<item>
+	<title>Project Diva f with English subs</title>
+  <link>http://localhost:8000../lib/project-diva-f-better-english.html</link>
+	<description></description>
+	<pubDate>Wed, 26 Jun 2019 22:37:56 -0600</pubDate>
+</item>
+<item>
+	<title>SBCL k plibonigita terminalo</title>
+  <link>http://localhost:8000../lib/sbcl-k-plibonigita-terminalo.html</link>
+	<description></description>
+	<pubDate>Wed, 19 Jun 2019 20:21:01 -0600</pubDate>
+</item>
+<item>
+	<title>Cowsay and Rewarding HTML</title>
+  <link>http://localhost:8000../lib/cowsay-and-html.html</link>
+	<description></description>
+	<pubDate>Sun, 17 Feb 2019 19:43:43 -0600</pubDate>
+</item>
+
+</channel>
+</rss>