Implemented the search database export
[biaweb_qt.git] / biaweb_exporter.py
index e824c08..47ccead 100644 (file)
@@ -3,11 +3,34 @@
 
 import os
 import os.path
+import sys
 import time
 import sqlite3
 import string
 import shutil
+import HTMLParser
+import cPickle
 import biaweb_db
+import biaweb_strings
+
+# class to remove HTML tags from a string - used by the search
+# index generator to get the plain text of an article without HTML tags
+class HTMLTagRemover (HTMLParser.HTMLParser):
+       def __init__ (self):
+               HTMLParser.HTMLParser.__init__ (self)
+               # initialize the list for data - this will be converted
+               # to a string before returning
+               self.data_list = []
+
+       # this handler is called with every run of text that is not
+       # part of an HTML tag
+       def handle_data (self, data):
+               self.data_list.append (data)
+
+       # return the string thus collected by the handle_data event
+       def get_raw_string (self):
+               raw_string = "".join (self.data_list)
+               return raw_string
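
A minimal usage sketch of the tag remover (Python 2, as in the rest of the module; the sample markup is illustrative only):

    parser = HTMLTagRemover ()
    parser.feed ("<p>Hello <b>world</b></p>")
    parser.close ()
    print parser.get_raw_string ()   # prints "Hello world"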
 
 # to format the best rated articles in a HTML link list
 def html_format_best_rated (best_rated):
@@ -48,6 +71,128 @@ def html_format_rating (rating):
        rating_str = "".join (items)
        return rating_str
 
+# function to generate the search index file
+def generate_search_index (dbname, conf, full_text_index = True):
+       # get all the articles
+       arts = biaweb_db.site_articles (dbname)
+       # if cannot get articles
+       if arts == False:
+               return False
+
+       # for a full text index the search field is the article content,
+       # otherwise it is the keywords
+       if full_text_index:
+               searchfield = 4
+       else:
+               searchfield = 3
+
+       # initialize the search index dictionary
+       search_index = dict ()
+
+       # now run through the articles and generate a table of unique words (except
+       # stop words)
+       for art in arts:
+               # now strip out the HTML tags from the articles
+               parser = HTMLTagRemover ()
+               parser.feed (art[searchfield])
+               parser.close ()
+               # get the word list
+               word_list = parser.get_raw_string ().split ()
+
+               # run through each word: strip surrounding punctuation and
+               # whitespace, convert to lower case and add it to the dictionary
+               for word in word_list:
+                       cleanword = word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower ()
+                       # if the word is not a "stop word", then add it to the search database
+                       if cleanword not in biaweb_strings.stopwords:
+                               # title of the search entry should be the article title
+                               title = art[1]
+                               # url should be the article URL: http://siteurl/Category/Article.html
+                               url = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"
+                               # add the (title, url) tuple to this word's
+                               # set, creating the set the first time the
+                               # word is seen
+                               if cleanword not in search_index:
+                                       search_index[cleanword] = set ()
+                               search_index[cleanword].add ((title, url))
+
+       # done - now write the search index dictionary as a binary pickle
+       search_index_path = os.path.join (conf[5], "cgi-bin", "searchwords.idx")
+       htaccess_path = os.path.join (conf[5], "cgi-bin", ".htaccess")
+       try:
+               # open the file in write binary mode
+               fsearchindex = open (search_index_path, "wb")
+               # dump the dictionary as a pickle using binary protocol 2
+               cPickle.dump (search_index, fsearchindex, 2)
+               fsearchindex.close ()
+               # write the htaccess file to prevent the index file from
+               # being fetched through a web browser
+               fhtaccess = open (htaccess_path, "w+")
+               fhtaccess.write (biaweb_strings.searchindex_htaccess)
+               fhtaccess.close ()
+       except (OSError, IOError):
+               return False
+
+       # finally return true
+       return True
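
The pickle written above maps each cleaned word to a set of (title, url) tuples. A sketch of how a consumer such as the bundled search.py might load and query it (the query handling shown here is an assumption, not taken from search.py itself):

    import cPickle

    fidx = open ("searchwords.idx", "rb")
    search_index = cPickle.load (fidx)
    fidx.close ()
    # normalize the query term the same way the indexer cleans words
    term = "Example".strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower ()
    for title, url in search_index.get (term, set ()):
        print title, url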
+
+# function to copy additional files and folders to the destination path
+def copy_files_folders (conf, files_to_copy, folders_to_copy):
+       # create the cgi-bin directory if it does not exist - the search
+       # index generator needs it even when search.py cannot be copied
+       cgi_bin_path = os.path.join (conf[5], "cgi-bin")
+       try:
+               if not os.path.isdir (cgi_bin_path):
+                       os.mkdir (cgi_bin_path)
+       except OSError:
+               return False
+
+       # try to copy search.py into cgi-bin if possible, otherwise the
+       # user must copy it manually
+       search_script_path = os.path.join (sys.path[0], "search.py")
+       if os.path.exists (search_script_path):
+               try:
+                       shutil.copy2 (search_script_path, cgi_bin_path)
+               except (IOError, OSError):
+                       return False
+
+       # try to copy the star rating images to the destination directory
+       # if possible, otherwise the user must copy them manually
+       rating_img_star = os.path.join (sys.path[0], "star.gif")
+       rating_img_greystar = os.path.join (sys.path[0], "star-grey.gif")
+       if os.path.exists (rating_img_star):
+               try:
+                       shutil.copy2 (rating_img_star, conf[5])
+               except (IOError, OSError):
+                       return False
+       if os.path.exists (rating_img_greystar):
+               try:
+                       shutil.copy2 (rating_img_greystar, conf[5])
+               except (IOError, OSError):
+                       return False
+
+       # copy the additional files, if any
+       if files_to_copy != []:
+               for src, dest in files_to_copy:
+                       # get full path from relative path in dest
+                       full_dest = os.path.join (conf[5], dest)
+                       try:
+                               shutil.copy2 (src, full_dest)
+                       except (IOError, OSError):
+                               return False
+
+       # copy the additional folders, if any
+       if folders_to_copy != []:
+               for src, dest in folders_to_copy:
+                       # get full path from relative path in dest
+                       full_dest = os.path.join (conf[5], dest)
+                       # note: copytree creates full_dest itself and fails
+                       # if the destination already exists
+                       try:
+                               shutil.copytree (src, full_dest)
+                       except (IOError, OSError, shutil.Error):
+                               return False
+
+       # finally return true
+       return True
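
Both lists are (source, destination) pairs where the destination is relative to the site output directory conf[5]. A call might look like this, assuming conf is the already loaded site configuration tuple (the paths are illustrative):

    files_to_copy = [("/home/user/logo.png", "logo.png")]
    folders_to_copy = [("/home/user/icons", "icons")]
    if not copy_files_folders (conf, files_to_copy, folders_to_copy):
        print "copying additional files/folders failed"

Note that shutil.copytree creates each destination folder itself, so the folder destinations must not already exist under conf[5].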
+
 # function to generate article pages
 def generate_article_pages (dbname, conf, templates, category_str, bestrated_str):
        # main template
@@ -273,5 +418,15 @@ def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True
        if ret == False:
                return False
 
+       # copy other files/folders into the destination path
+       ret = copy_files_folders (conf, files_to_copy, folders_to_copy)
+       if ret == False:
+               return False
+
+       # now generate the search index database
+       ret = generate_search_index (dbname, conf, search_type_full)
+       if ret == False:
+               return False
+
        # finally when all is successfully done return true
        return True
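
With both steps wired into generate_site, a full export remains a single call; a sketch of the call site (the database filename is illustrative):

    ok = generate_site ("site.db", files_to_copy, folders_to_copy, search_type_full = True)
    if not ok:
        print "site generation failed"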