import sqlite3
import string
import shutil
+import HTMLParser
+import cPickle
import biaweb_db
+import biaweb_strings
+
# Strips markup from an HTML document by collecting only the character-data
# events fired by the base HTMLParser; the search-index generator uses this
# so tag names and attributes do not pollute the word list.
class HTMLTagRemover (HTMLParser.HTMLParser):
    def __init__ (self):
        HTMLParser.HTMLParser.__init__ (self)
        # plain-text fragments gathered while feeding the document;
        # joined into a single string only when asked for
        self._chunks = []

    # called by the base parser for every run of text found outside a tag
    def handle_data (self, data):
        self._chunks.append (data)

    # concatenate everything collected so far and hand it back
    def get_raw_string (self):
        return "".join (self._chunks)
# to format the best rated articles in a HTML link list
# NOTE(review): this diff hunk elides the code that builds `items`
# (presumably a list of link strings derived from best_rated) -- as shown,
# `items` is undefined in this fragment; confirm against the full file.
def html_format_best_rated (best_rated):
	rating_str = "".join (items)
	return rating_str
# function to generate the search index file
def generate_search_index (dbname, conf, full_text_index = True):
    """Build the site search index and write it to cgi-bin/searchwords.idx
    as a binary pickle, plus a .htaccess that blocks direct web access to
    the index file.

    dbname          -- path to the site database
    conf            -- site configuration tuple (conf[0] = site url,
                       conf[5] = destination/output directory)
    full_text_index -- True to index full article text (field 4),
                       False to index only the keywords (field 3)

    Returns True on success, False on any failure."""
    # get all the articles
    arts = biaweb_db.site_articles (dbname)
    # if cannot get articles
    if arts is False:
        return False

    # if full text index, then field should be article content otherwise keywords
    if full_text_index:
        searchfield = 4
    else:
        searchfield = 3

    # search_index maps each cleaned lower-cased word to a set of
    # (article title, article url) tuples
    search_index = dict ()

    # now run through the articles and generate a table of unique words
    # (except stop words)
    for art in arts:
        # strip out the HTML tags from the article text
        parser = HTMLTagRemover ()
        parser.feed (art[searchfield])
        parser.close ()
        # get the word list
        word_list = parser.get_raw_string ().split ()

        # title/url are identical for every word of this article, so
        # compute them once outside the word loop.
        # title of the search entry should be the article title
        title = art[1]
        # url should be the article URL: http://siteurl/Category/Article.html
        url = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"

        # now run through each word, make it lowercase, remove all cruft
        # from it and add it to the dictionary
        for word in word_list:
            cleanword = word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower ()
            # skip tokens that strip down to nothing (pure punctuation),
            # and skip "stop words"
            if cleanword and cleanword not in biaweb_strings.stopwords:
                # setdefault creates the set on first sight of the word
                # (replaces the deprecated dict.has_key test)
                search_index.setdefault (cleanword, set ()).add ((title, url))

    # done - now write the search database as a pickle of search_index
    # NOTE(review): assumes `import os` exists at file level (not visible
    # in this hunk) -- confirm
    search_index_path = os.path.join (conf[5], "cgi-bin", "searchwords.idx")
    htaccess_path = os.path.join (conf[5], "cgi-bin", ".htaccess")
    try:
        # open the index file in write binary mode; close in finally so
        # the handle is not leaked if the dump raises
        fsearchindex = open (search_index_path, "wb")
        try:
            # dump the dictionary as a pickle object in binary mode
            cPickle.dump (search_index, fsearchindex, 2)
        finally:
            fsearchindex.close ()
        # write the htaccess file to prevent opening the index file from
        # a web browser
        fhtaccess = open (htaccess_path, "w+")
        try:
            fhtaccess.write (biaweb_strings.searchindex_htaccess)
        finally:
            fhtaccess.close ()
    except (OSError, IOError):
        return False

    # finally return true
    return True
+
# function to copy additional files and folders to the destination path
# NOTE(review): diff hunk with context elided -- the loops that bind
# `src`/`dest` for the two copy passes, and the definitions of
# search_script_path / rating_img_star / rating_img_greystar, are not
# visible here. The -/+ pairs below fix the exception syntax: the old
# "except IOError, OSError" only catches IOError (binding it to the name
# OSError); the tuple form catches both.
def copy_files_folders (conf, files_to_copy, folders_to_copy):
	# create the cgi-bin directory and try to copy search.py into the destination directory if possible
	try:
		os.mkdir (os.path.join (conf[5], "cgi-bin"))
		shutil.copy2 (search_script_path, os.path.join(conf[5], "cgi-bin"))
-	except IOError, OSError:
+	except (IOError, OSError):
		return False
	# try to copy the star rating images to destination directory if possible
	if os.path.exists (rating_img_star):
		try:
			shutil.copy2 (rating_img_star, conf[5])
-		except IOError, OSError:
+		except (IOError, OSError):
			return False
	if os.path.exists (rating_img_greystar):
		try:
			shutil.copy2 (rating_img_greystar, conf[5])
-		except IOError, OSError:
+		except (IOError, OSError):
			return False
	# additional files to copy
	full_dest = os.path.join (conf[5], dest)
	try:
		shutil.copy2 (src, full_dest)
-	except IOError, OSError:
+	except (IOError, OSError):
		return False
	# additional folders to copy
	full_dest = os.path.join (conf[5], dest)
	try:
		shutil.copytree (src, full_dest)
-	except IOError, OSError:
+	except (IOError, OSError):
		return False
	# finally return true
	return True
-
-
# function to generate article pages
# NOTE(review): diff hunk with most of the body elided (the per-article
# loop header and the rest of the safe_substitute argument list are not
# visible). The site_title -/+ pair changes the page title from the site
# title (conf[1]) to the article title (art[1]); the except -/+ pairs fix
# the old "except E1, E2" syntax to the correct tuple form.
def generate_article_pages (dbname, conf, templates, category_str, bestrated_str):
	# main template
	# get all articles from the database
	articles = biaweb_db.site_articles (dbname)
-	if articles == False:
+	if articles is False:
		return
	# walk through each article and generate the file in the appropriate category
		article_contents = art[4])
	# now build the article page
-		articlepage_str = tpl_main.safe_substitute (site_title = conf[1],
+		articlepage_str = tpl_main.safe_substitute (site_title = art[1],
			site_url = "http://" + conf[0],
			meta_keywords = art[3],
			meta_description = art[2],
		try:
			farticle = open (os.path.join (conf[5], art[13], art[8] + ".html"), "w+")
			farticle.write (articlepage_str)
-		except OSError, IOError:
+		except (OSError, IOError):
			return False
	# finally return true
	# NOTE(review): from here on this hunk is the interior of the category
	# index generator (presumably generate_category_indices, called from
	# generate_site) whose def line and per-category loop header are elided
	# from the diff. The site_title -/+ pair appends the category name
	# (cat[1]) to the page title; the except -/+ pairs fix the old
	# "except E1, E2" syntax to the correct tuple form.
	try:
		# create the category directory
		os.mkdir (os.path.join (conf[5], cat[3]))
-	except IOError, OSError:
+	except (IOError, OSError):
		return False
	# now get the list of articles for the specified category
	articles_list = biaweb_db.site_articles (dbname, cat[0])
-	if articles_list == False:
+	if articles_list is False:
		return False
	tableitems = []
		table_rows = tablerows_str)
	# now create the index page
-	categoryindex_str = tpl_main.safe_substitute (site_title = conf[1],
+	categoryindex_str = tpl_main.safe_substitute (site_title = conf[1] + " - " + cat[1],
		site_url = "http://" + conf[0],
		meta_keywords = conf[2],
		meta_description = cat[2],
		fcatindex = open (os.path.join (conf[5], cat[3], "index.html"), "w+")
		fcatindex.write (categoryindex_str)
		fcatindex.close ()
-	except OSError, IOError:
+	except (OSError, IOError):
		return False
	# finally return true
	return True
# function to generate the RSS feed for the website
def generate_rss_feed (dbname, conf):
    """Generate subscribe.xml (the site RSS feed) in the destination
    directory.

    dbname -- path to the site database
    conf   -- site configuration tuple (conf[0] = site url,
              conf[1] = site title, conf[3] = site description,
              conf[4] = number of feed items, conf[5] = destination dir)

    Returns True on success, False on failure."""
    # rss main template
    tpl_rss = string.Template (biaweb_strings.template_rss)
    # rss item bit template
    tpl_rss_itembit = string.Template (biaweb_strings.template_rss_item)

    # get the latest articles (limit by number of rss items)
    arts = biaweb_db.site_latest_articles (dbname, conf[4])
    if arts is False:
        return False

    rss_item_list = []
    # run through the articles and generate the rss items
    for art in arts:
        # item link: http://siteurl/Category/Article.html
        itemlink = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"
        item_str = tpl_rss_itembit.safe_substitute (item_title = art[1],
                                item_link = itemlink,
                                description = art[2])
        rss_item_list.append (item_str)

    # now get the rss items as a string
    rss_item_str = "".join (rss_item_list)

    # now generate the feed
    rss_str = tpl_rss.safe_substitute (title = conf[1],
                    link = "http://" + conf[0],
                    description = conf[3],
                    rss_items = rss_item_str)

    # now try to write it to the rss file
    # NOTE(review): assumes `import os` exists at file level (not visible
    # in this hunk) -- confirm
    try:
        frss = open (os.path.join (conf[5], "subscribe.xml"), "w+")
        try:
            frss.write (rss_str)
        finally:
            # the original never closed this handle -- close it so the
            # feed is flushed to disk and the descriptor is released
            frss.close ()
    except (IOError, OSError):
        return False

    # finally return true
    return True
+
# function to generate main index file and stylesheet
# NOTE(review): diff hunk with the middle of the body elided (the code
# building news_items/main_str and the try: lines guarding both writes are
# not visible). The -/+ pairs switch identity comparison (is False) for the
# sentinel check and fix the old "except E1, E2" syntax to the tuple form.
def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
	# main template
	# get the latest articles - conf[4] is num of rss entries to be used also
	latest_arts = biaweb_db.site_latest_articles (dbname, conf[4])
-	if latest_arts == False:
+	if latest_arts is False:
		return False
	news_items = []
		findex = open (os.path.join (conf[5], "index.html"), "w+")
		findex.write (main_str)
		findex.close ()
-	except IOError, OSError:
+	except (IOError, OSError):
		return False
	# write the style.css file in the destination directory
		fstyle = open (os.path.join (conf[5], "style.css"), "w+")
		fstyle.write (templates[5][1])
		fstyle.close ()
-	except IOError, OSError:
+	except (IOError, OSError):
		return False
	return True
# NOTE(review): top-level driver. This hunk adds early-exit checks for the
# configuration/templates, wires in the new generate_rss_feed and
# generate_search_index steps, and switches `== False` to `is False`
# (the db helpers return False as an error sentinel, so identity is the
# right test and avoids matching empty results). The assignment of
# cats_str and the trailing `return True` are elided from this hunk --
# confirm against the full file.
def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True):
	# get the configuration
	conf = biaweb_db.get_configuration (dbname)
+	# if cannot get configuration
+	if conf is False:
+		return False
+
	# get the templates
	tpls = biaweb_db.get_templates (dbname)
+	if tpls is False:
+		return False
	# get the list of categories
	cats = biaweb_db.get_categories (dbname)
	# cannot get categories return false
-	if cats == False:
+	if cats is False:
		return False
	# format the categories as a html bulleted list
	# get the best rated articles
	best_rated = biaweb_db.site_get_bestrated (dbname)
	# if cannot retrieve
-	if best_rated == False:
+	if best_rated is False:
		return False
	# format the best rated articles as a html bulleted list
	best_rated_str = html_format_best_rated (best_rated)
	# generate the index page including style sheet
	ret = generate_home_page (dbname, conf, tpls, cats_str, best_rated_str)
-	if ret == False:
+	if ret is False:
+		return False
+
+	# generate the rss feed
+	ret = generate_rss_feed (dbname, conf)
+	if ret is False:
		return False
	# generate the category directories and indices
	ret = generate_category_indices (dbname, conf, tpls, cats_str, best_rated_str, cats)
-	if ret == False:
+	if ret is False:
		return False
	# generate the article pages
	ret = generate_article_pages (dbname, conf, tpls, cats_str, best_rated_str)
-	if ret == False:
+	if ret is False:
		return False
	# copy other files/folders into the destination path
	ret = copy_files_folders (conf, files_to_copy, folders_to_copy)
-	if ret == False:
+	if ret is False:
+		return False
+
+	# now generate the search index database
+	ret = generate_search_index (dbname, conf, search_type_full)
+	if ret is False:
		return False
	# finally when all is successfully done return true