X-Git-Url: https://harishankar.org/repos/?p=biaweb_qt.git;a=blobdiff_plain;f=biaweb_exporter.py;h=6cba7065096e61c2ab96244d5ab963de2df9b89f;hp=5bbe32b906b484f746f0eca72f072686a8f2a516;hb=ba3069991aeda91317057aa2c115f8c2151d0a98;hpb=f37ae5235dba470cf02e25a885b83a2bc9c78694
diff --git a/biaweb_exporter.py b/biaweb_exporter.py
index 5bbe32b..6cba706 100644
--- a/biaweb_exporter.py
+++ b/biaweb_exporter.py
@@ -3,11 +3,34 @@
import os
import os.path
+import sys
import time
import sqlite3
import string
import shutil
+import HTMLParser
+import cPickle
import biaweb_db
+import biaweb_strings
+
+# Parser that keeps only the text of an HTML string and drops the tags - used
+# by the search index generator to get an article without HTML tags
+class HTMLTagRemover (HTMLParser.HTMLParser):
+ def __init__ (self):
+ HTMLParser.HTMLParser.__init__ (self)
+ # text fragments collected so far, in the order they were received;
+ # get_raw_string joins them into one string
+ self.data_list = []
+
+ # Called by the base parser with the data of the string that is anything
+ # that is not a HTML tag; every fragment is stored unchanged
+ def handle_data (self, data):
+ self.data_list.append (data)
+
+ # return the fragments collected by handle_data, joined with no separator
+ def get_raw_string (self):
+ raw_string = "".join (self.data_list) # NOTE(review): text of adjacent tags is fused ("<li>a</li><li>b</li>" gives "ab") - confirm this is acceptable for word splitting
+ return raw_string
# to format the best rated articles in a HTML link list
def html_format_best_rated (best_rated):
@@ -32,6 +55,296 @@ def html_format_categories (cats):
str_items = "".join (items)
return str_items
+# Convert a rating number into the markup shown for it: the text "unrated" for -1, otherwise 10 slots (stars, then grey stars)
+def html_format_rating (rating):
+ items = []
+ # if -1 then return unrated as the text
+ if rating == -1:
+ return "unrated"
+ # fill up the number of stars for the rating
+ for i in range (rating):
+ items.append ('') # NOTE(review): the literal is empty as visible here - presumably the star image markup is missing; confirm against the repository
+ # fill up remaining slots (of 10) with grey stars
+ for i in range (10 - rating):
+ items.append ('') # NOTE(review): empty here as well, so the joined result below is always ""; confirm
+ # join the collected slots into a single string
+ rating_str = "".join (items)
+ return rating_str
+
+# Generate the search index file (cgi-bin/searchwords.idx) and its .htaccess.
+def generate_search_index (dbname, conf, full_text_index = True):
+    """Build a word -> set of (title, url) index and pickle it to disk.
+
+    The indexed field is the article content (art[4]) when full_text_index
+    is true, otherwise the keywords (art[3]).  HTML tags are stripped and
+    words found in biaweb_strings.stopwords are skipped.  The index goes to
+    conf[5]/cgi-bin/searchwords.idx, next to a .htaccess file.  Returns True
+    on success, False when the articles cannot be read or a file cannot be
+    written.
+    """
+    # get all the articles
+    arts = biaweb_db.site_articles (dbname)
+    if arts is False:
+        return False
+
+    # if full text index, then field should be article content otherwise keywords
+    if full_text_index:
+        searchfield = 4
+    else:
+        searchfield = 3
+
+    # maps each word to the set of (title, url) tuples of the articles using it
+    search_index = dict ()
+
+    for art in arts:
+        # strip out the HTML tags from the indexed field
+        parser = HTMLTagRemover ()
+        parser.feed (art[searchfield])
+        parser.close ()
+
+        # the entry is the same for every word of the article, so build it once;
+        # url should be the article URL: http://siteurl/Category/Article.html
+        title = art[1]
+        url = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"
+
+        for word in parser.get_raw_string ().split ():
+            # make the word lowercase and remove all cruft around it
+            cleanword = word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower ()
+            # a word made only of cruft strips down to "" - never index that
+            if cleanword == "" or cleanword in biaweb_strings.stopwords:
+                continue
+            # the set (created on first use) holds the (title, url) tuples
+            search_index.setdefault (cleanword, set ()).add ((title, url))
+
+    # done now write the search database as a python pickle object of search_index
+    search_index_path = os.path.join (conf[5], "cgi-bin", "searchwords.idx")
+    htaccess_path = os.path.join (conf[5], "cgi-bin", ".htaccess")
+    try:
+        # open the file in write binary mode
+        fsearchindex = open (search_index_path, "wb")
+        try:
+            # dump the dictionary as a pickle object in binary mode
+            cPickle.dump (search_index, fsearchindex, 2)
+        finally:
+            fsearchindex.close ()
+        # write the htaccess file to prevent opening the index file from web browser
+        fhtaccess = open (htaccess_path, "w+")
+        try:
+            fhtaccess.write (biaweb_strings.searchindex_htaccess)
+        finally:
+            fhtaccess.close ()
+    except (OSError, IOError):
+        return False
+
+    return True
+
+# Copy the search script, the rating images and the additional files and
+# folders to the destination path.
+def copy_files_folders (conf, files_to_copy, folders_to_copy):
+    """Populate the destination directory (conf[5]) with the extra site files.
+
+    files_to_copy and folders_to_copy hold (src, dest) pairs; dest is taken
+    relative to conf[5].  search.py, star.gif and star-grey.gif are looked
+    up in sys.path[0] and copied only when present - otherwise the user must
+    copy them manually.  Returns True on success, False as soon as creating
+    the cgi-bin directory or any copy fails.
+    """
+    cgi_bin_path = os.path.join (conf[5], "cgi-bin")
+    search_script_path = os.path.join (sys.path[0], "search.py")
+    try:
+        # always make sure cgi-bin exists: generate_search_index writes its
+        # index there even when there is no search.py to copy
+        if not os.path.isdir (cgi_bin_path):
+            os.mkdir (cgi_bin_path)
+        if os.path.exists (search_script_path):
+            shutil.copy2 (search_script_path, cgi_bin_path)
+    except (IOError, OSError):
+        return False
+
+    # try to copy the star rating images to destination directory if possible
+    # otherwise user must copy it manually
+    for img_name in ("star.gif", "star-grey.gif"):
+        rating_img = os.path.join (sys.path[0], img_name)
+        if os.path.exists (rating_img):
+            try:
+                shutil.copy2 (rating_img, conf[5])
+            except (IOError, OSError):
+                return False
+
+    # additional files to copy; an empty list simply skips the loop, so no
+    # explicit emptiness test is needed
+    for src, dest in files_to_copy:
+        # get full path from relative path in dest
+        full_dest = os.path.join (conf[5], dest)
+        try:
+            shutil.copy2 (src, full_dest)
+        except (IOError, OSError):
+            return False
+
+    # additional folders to copy
+    for src, dest in folders_to_copy:
+        # get full path from relative path in dest
+        full_dest = os.path.join (conf[5], dest)
+        try:
+            shutil.copytree (src, full_dest)
+        except (IOError, OSError):
+            return False
+
+    # finally return true
+    return True
+
+# Generate one HTML page per article, in the folder of the article's category.
+def generate_article_pages (dbname, conf, templates, category_str, bestrated_str):
+    """Write each article to conf[5]/art[13]/art[8].html.
+
+    The article template (templates[1][1]) is wrapped in the main template
+    (templates[0][1]).  Returns True on success, False on any failure.
+    """
+    tpl_main = string.Template (templates[0][1])
+    tpl_articlebit = string.Template (templates[1][1])
+
+    articles = biaweb_db.site_articles (dbname)
+    # return False (not None): generate_site detects failure with "is False"
+    if articles is False:
+        return False
+
+    for art in articles:
+        art_cdate = time.ctime (art[5])
+        art_mdate = time.ctime (art[6])
+        rating_str = html_format_rating (art[9])
+        article_str = tpl_articlebit.safe_substitute (article_title = art[1],
+                article_cdate = art_cdate,
+                article_mdate = art_mdate,
+                rating = rating_str,
+                article_contents = art[4])
+        articlepage_str = tpl_main.safe_substitute (site_title = art[1],
+                site_url = "http://" + conf[0],
+                meta_keywords = art[3],
+                meta_description = art[2],
+                page_title = conf[1],
+                page_desc = conf[3],
+                contents_bit = article_str,
+                list_of_categories = category_str,
+                list_best_rated = bestrated_str,
+                copyright = conf[6])
+        # the inner finally closes the file even when the write fails
+        try:
+            farticle = open (os.path.join (conf[5], art[13], art[8] + ".html"), "w+")
+            try:
+                farticle.write (articlepage_str)
+            finally:
+                farticle.close ()
+        except (OSError, IOError):
+            return False
+
+    return True
+
+# Create one directory per category under conf[5] and write its index.html (a table of the category's articles); returns True, or False on the first failure
+def generate_category_indices (dbname, conf, templates, category_str, bestrated_str, category_list):
+ # main template
+ tpl_main = string.Template (templates[0][1])
+ # table bit
+ tpl_tablebit = string.Template (templates[3][1])
+ # table row bit
+ tpl_trowbit = string.Template (templates[4][1])
+
+ # run through each category and generate category index page
+ for cat in category_list:
+ try:
+ # create the category directory (cat[3] is used as the directory name)
+ os.mkdir (os.path.join (conf[5], cat[3])) # os.mkdir raises OSError for an already existing directory, which ends the export below
+ except (IOError, OSError):
+ return False
+
+ # now get the list of articles for the specified category (cat[0] selects it)
+ articles_list = biaweb_db.site_articles (dbname, cat[0])
+ if articles_list is False:
+ return False
+
+ tableitems = []
+ # run through the list of articles in category
+ for art in articles_list:
+ url = art[13] + "/" + art[8] + ".html" # no site prefix here; presumably art[13] equals cat[3] - verify against biaweb_db
+ creattime = time.ctime (art[5])
+ rating_str = html_format_rating (art[9])
+ # now build the table rows for each article
+ tableitem_str = tpl_trowbit.safe_substitute (article_url = url,
+ title = art[1],
+ created = creattime,
+ rating = rating_str)
+ tableitems.append (tableitem_str)
+ # generate the rows as a string
+ tablerows_str = "".join (tableitems)
+
+ # now fill the table template with the category title, description and rows
+ table_str = tpl_tablebit.safe_substitute (category_title = cat[1],
+ category_desc = cat[2],
+ table_rows = tablerows_str)
+
+ # now create the index page by placing the table in the main template
+ categoryindex_str = tpl_main.safe_substitute (site_title = conf[1] + " - " + cat[1],
+ site_url = "http://" + conf[0],
+ meta_keywords = conf[2],
+ meta_description = cat[2],
+ page_title = conf[1],
+ page_desc = conf[3],
+ contents_bit = table_str,
+ list_of_categories = category_str,
+ list_best_rated = bestrated_str,
+ copyright = conf[6])
+
+ # now write to Category/index.html
+ try:
+ fcatindex = open (os.path.join (conf[5], cat[3], "index.html"), "w+")
+ fcatindex.write (categoryindex_str)
+ fcatindex.close ()
+ except (OSError, IOError):
+ return False
+
+ # every category was written successfully
+ return True
+
+# Generate the RSS feed (subscribe.xml) for the website.
+def generate_rss_feed (dbname, conf):
+    """Write the feed of the latest articles to conf[5]/subscribe.xml.
+
+    conf[4] limits the number of items.  Returns True on success, False
+    when the articles cannot be fetched or the file cannot be written.
+    """
+    tpl_rss = string.Template (biaweb_strings.template_rss)
+    tpl_rss_itembit = string.Template (biaweb_strings.template_rss_item)
+
+    # get the latest articles (limit by number of rss items)
+    arts = biaweb_db.site_latest_articles (dbname, conf[4])
+    if arts is False:
+        return False
+
+    rss_item_list = []
+    for art in arts:
+        # item link: "http://" + site url + category + "/" + article + ".html"
+        itemlink = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"
+        item_str = tpl_rss_itembit.safe_substitute (item_title = art[1],
+                item_link = itemlink,
+                description = art[2])
+        rss_item_list.append (item_str)
+
+    rss_str = tpl_rss.safe_substitute (title = conf[1],
+            link = "http://" + conf[0],
+            description = conf[3],
+            rss_items = "".join (rss_item_list))
+
+    # the inner finally closes the feed file even when the write fails
+    try:
+        frss = open (os.path.join (conf[5], "subscribe.xml"), "w+")
+        try:
+            frss.write (rss_str)
+        finally:
+            frss.close ()
+    except (IOError, OSError):
+        return False
+
+    return True
+
# function to generate main index file and stylesheet
def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
# main template
@@ -43,7 +356,7 @@ def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
# get the latest articles - conf[4] is num of rss entries to be used also
latest_arts = biaweb_db.site_latest_articles (dbname, conf[4])
- if latest_arts == False:
+ if latest_arts is False:
return False
news_items = []
@@ -86,7 +399,7 @@ def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
findex = open (os.path.join (conf[5], "index.html"), "w+")
findex.write (main_str)
findex.close ()
- except IOError, OSError:
+ except (IOError, OSError):
return False
# write the style.css file in the destination directory
@@ -94,8 +407,7 @@ def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
fstyle = open (os.path.join (conf[5], "style.css"), "w+")
fstyle.write (templates[5][1])
fstyle.close ()
- print "error"
- except IOError, OSError:
+ except (IOError, OSError):
return False
return True
@@ -104,13 +416,19 @@ def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True):
# get the configuration
conf = biaweb_db.get_configuration (dbname)
+ # if cannot get configuration
+ if conf is False:
+ return False
+
# get the templates
tpls = biaweb_db.get_templates (dbname)
+ if tpls is False:
+ return False
# get the list of categories
cats = biaweb_db.get_categories (dbname)
# cannot get categories return false
- if cats == False:
+ if cats is False:
return False
# format the categories as a html bulleted list
@@ -119,7 +437,7 @@ def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True
# get the best rated articles
best_rated = biaweb_db.site_get_bestrated (dbname)
# if cannot retrieve
- if best_rated == False:
+ if best_rated is False:
return False
# format the best rated articles as a html bulleted list
best_rated_str = html_format_best_rated (best_rated)
@@ -134,7 +452,32 @@ def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True
# generate the index page including style sheet
ret = generate_home_page (dbname, conf, tpls, cats_str, best_rated_str)
- if ret == False:
+ if ret is False:
+ return False
+
+ # generate the rss feed
+ ret = generate_rss_feed (dbname, conf)
+ if ret is False:
+ return False
+
+ # generate the category directories and indices
+ ret = generate_category_indices (dbname, conf, tpls, cats_str, best_rated_str, cats)
+ if ret is False:
+ return False
+
+ # generate the article pages
+ ret = generate_article_pages (dbname, conf, tpls, cats_str, best_rated_str)
+ if ret is False:
+ return False
+
+ # copy other files/folders into the destination path
+ ret = copy_files_folders (conf, files_to_copy, folders_to_copy)
+ if ret is False:
+ return False
+
+ # now generate the search index database
+ ret = generate_search_index (dbname, conf, search_type_full)
+ if ret is False:
return False
# finally when all is successfully done return true