X-Git-Url: https://harishankar.org/repos/?p=biaweb_qt.git;a=blobdiff_plain;f=biaweb_exporter.py;h=6cba7065096e61c2ab96244d5ab963de2df9b89f;hp=7c887c6c26642b099e63a9c93ba29d41e253f9ba;hb=HEAD;hpb=de54d2eb3b283030bd98f7a1f6d056ce2e56b4d7 diff --git a/biaweb_exporter.py b/biaweb_exporter.py index 7c887c6..6cba706 100644 --- a/biaweb_exporter.py +++ b/biaweb_exporter.py @@ -3,11 +3,34 @@ import os import os.path +import sys import time import sqlite3 import string import shutil +import HTMLParser +import cPickle import biaweb_db +import biaweb_strings + +# class to remove HTML tags from a string - used by the search +# generator to get an article without HTML tags +class HTMLTagRemover (HTMLParser.HTMLParser): + def __init__ (self): + HTMLParser.HTMLParser.__init__ (self) + # initialize the list for data - this will be converted + # to a string before returning + self.data_list = [] + + # This event gets the data of the string that is anything that + # is not a HTML tag + def handle_data (self, data): + self.data_list.append (data) + + # return the string thus collected by the handle_data event + def get_raw_string (self): + raw_string = "".join (self.data_list) + return raw_string # to format the best rated articles in a HTML link list def html_format_best_rated (best_rated): @@ -48,6 +71,174 @@ def html_format_rating (rating): rating_str = "".join (items) return rating_str +# function to generate the search index file +def generate_search_index (dbname, conf, full_text_index = True): + # get all the articles + arts = biaweb_db.site_articles (dbname) + # if cannot get articles + if arts is False: + return False + + # if full text index, then field should be article content otherwise keywords + if full_text_index: + searchfield = 4 + else: + searchfield = 3 + + # initialize the search index dictionary + search_index = dict () + + # now run through the articles and generate a table of unique words (except + # stop words) + for art in arts: + # now strip out the HTML tags from the articles + parser = HTMLTagRemover () + parser.feed (art[searchfield]) + parser.close () + # get the word list + word_list = parser.get_raw_string ().split () + + # now run through each word, make it lowercase, remove all cruft from it + # and add it to a dictionary + for word in word_list: + cleanword = word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower () + # if the word is not a "stop word", then add it to the search database + if cleanword not in biaweb_strings.stopwords: + # title of the search entry should be the article title + title = art[1] + # url should be the article URL: http://siteurl/Category/Article.html + url = "http://" + conf[0] + art[13] + "/" + art[8] + ".html" + # if search index has the word (as key) + if search_index.has_key (cleanword): + # add the title and url as a tuple to the set + search_index[cleanword].add ((title, url)) + # create the key as the word + else: + # create a set for the keyword. Set will hold the tuples + # representing article title and url + search_index[cleanword] = set () + search_index[cleanword].add ((title, url)) + + # done now write the search database as a python pickle object of search_index + search_index_path = os.path.join (conf[5], "cgi-bin", "searchwords.idx") + htaccess_path = os.path.join (conf[5], "cgi-bin", ".htaccess") + try: + # open the file in write binary mode + fsearchindex = open (search_index_path, "wb") + # dump the dictionary as a pickle object in binary mode + cPickle.dump (search_index, fsearchindex, 2) + fsearchindex.close () + # write the htaccess file to prevent opening the index file from web browser + fhtaccess = open (htaccess_path, "w+") + fhtaccess.write (biaweb_strings.searchindex_htaccess) + fhtaccess.close () + except (OSError, IOError): + return False + + # finally return true + return True + +# function to copy additional files and folders to the destination path +def copy_files_folders (conf, files_to_copy, folders_to_copy): + # create the cgi-bin directory and try to copy search.py into the destination directory if possible + # otherwise user must copy it manually + search_script_path = os.path.join (sys.path[0], "search.py") + if os.path.exists (search_script_path): + try: + os.mkdir (os.path.join (conf[5], "cgi-bin")) + shutil.copy2 (search_script_path, os.path.join(conf[5], "cgi-bin")) + except (IOError, OSError): + return False + + # try to copy the star rating images to destination directory if possible + # otherwise user must copy it manually + rating_img_star = os.path.join (sys.path[0], "star.gif") + rating_img_greystar = os.path.join (sys.path[0], "star-grey.gif") + if os.path.exists (rating_img_star): + try: + shutil.copy2 (rating_img_star, conf[5]) + except (IOError, OSError): + return False + if os.path.exists (rating_img_greystar): + try: + shutil.copy2 (rating_img_greystar, conf[5]) + except (IOError, OSError): + return False + + # additional files to copy + + # first copy files + # check if files to copy is not empty + if files_to_copy <> []: + for src, dest in files_to_copy: + # get full path from relative path in dest + full_dest = os.path.join (conf[5], dest) + try: + shutil.copy2 (src, full_dest) + except (IOError, OSError): + return False + + # additional folders to copy + + # now copy the folders + if folders_to_copy <> []: + for src, dest in folders_to_copy: + # get full path from relative path in dest + full_dest = os.path.join (conf[5], dest) + try: + shutil.copytree (src, full_dest) + except (IOError, OSError): + return False + + # finally return true + return True + +# function to generate article pages +def generate_article_pages (dbname, conf, templates, category_str, bestrated_str): + # main template + tpl_main = string.Template (templates[0][1]) + # article template + tpl_articlebit = string.Template (templates[1][1]) + + # get all articles from the database + articles = biaweb_db.site_articles (dbname) + if articles is False: + return + + # walk through each article and generate the file in the appropriate category + # folder + for art in articles: + art_cdate = time.ctime (art[5]) + art_mdate = time.ctime (art[6]) + rating_str = html_format_rating (art[9]) + # now build the article from the article bit template + article_str = tpl_articlebit.safe_substitute (article_title = art[1], + article_cdate = art_cdate, + article_mdate = art_mdate, + rating = rating_str, + article_contents = art[4]) + + # now build the article page + articlepage_str = tpl_main.safe_substitute (site_title = art[1], + site_url = "http://" + conf[0], + meta_keywords = art[3], + meta_description = art[2], + page_title = conf[1], + page_desc = conf[3], + contents_bit = article_str, + list_of_categories = category_str, + list_best_rated = bestrated_str, + copyright = conf[6]) + # write to the article file + try: + farticle = open (os.path.join (conf[5], art[13], art[8] + ".html"), "w+") + farticle.write (articlepage_str) + except (OSError, IOError): + return False + + # finally return true + return True + # function to generate category directories and indices def generate_category_indices (dbname, conf, templates, category_str, bestrated_str, category_list): # main template @@ -62,12 +253,12 @@ def generate_category_indices (dbname, conf, templates, category_str, bestrated_ try: # create the category directory os.mkdir (os.path.join (conf[5], cat[3])) - except IOError, OSError: + except (IOError, OSError): return False # now get the list of articles for the specified category articles_list = biaweb_db.site_articles (dbname, cat[0]) - if articles_list == False: + if articles_list is False: return False tableitems = [] @@ -91,7 +282,7 @@ def generate_category_indices (dbname, conf, templates, category_str, bestrated_ table_rows = tablerows_str) # now create the index page - categoryindex_str = tpl_main.safe_substitute (site_title = conf[1], + categoryindex_str = tpl_main.safe_substitute (site_title = conf[1] + " - " + cat[1], site_url = "http://" + conf[0], meta_keywords = conf[2], meta_description = cat[2], @@ -107,12 +298,53 @@ def generate_category_indices (dbname, conf, templates, category_str, bestrated_ fcatindex = open (os.path.join (conf[5], cat[3], "index.html"), "w+") fcatindex.write (categoryindex_str) fcatindex.close () - except OSError, IOError: + except (OSError, IOError): return False # finally return true return True +# function to generate the RSS feed for the website +def generate_rss_feed (dbname, conf): + # rss main template + tpl_rss = string.Template (biaweb_strings.template_rss) + # rss item bit template + tpl_rss_itembit = string.Template (biaweb_strings.template_rss_item) + + # get the latest articles (limit by number of rss items) + arts = biaweb_db.site_latest_articles (dbname, conf[4]) + if arts is False: + return False + + rss_item_list = [] + # run through the articles and generate the rss items + for art in arts: + # link + itemlink = "http://" + conf[0] + art[13] + "/" + art[8] + ".html" + item_str = tpl_rss_itembit.safe_substitute (item_title = art[1], + item_link = itemlink, + description = art[2]) + rss_item_list.append (item_str) + + # now get the rss items as a string + rss_item_str = "".join (rss_item_list) + + # now generate the feed + rss_str = tpl_rss.safe_substitute (title = conf[1], + link = "http://" + conf[0], + description = conf[3], + rss_items = rss_item_str) + + # now try to write it to the rss file + try: + frss = open (os.path.join (conf[5], "subscribe.xml"), "w+") + frss.write (rss_str) + except (IOError, OSError): + return False + + # finally return true + return True + # function to generate main index file and stylesheet def generate_home_page (dbname, conf, templates, category_str, bestrated_str): # main template @@ -124,7 +356,7 @@ def generate_home_page (dbname, conf, templates, category_str, bestrated_str): # get the latest articles - conf[4] is num of rss entries to be used also latest_arts = biaweb_db.site_latest_articles (dbname, conf[4]) - if latest_arts == False: + if latest_arts is False: return False news_items = [] @@ -167,7 +399,7 @@ def generate_home_page (dbname, conf, templates, category_str, bestrated_str): findex = open (os.path.join (conf[5], "index.html"), "w+") findex.write (main_str) findex.close () - except IOError, OSError: + except (IOError, OSError): return False # write the style.css file in the destination directory @@ -175,7 +407,7 @@ def generate_home_page (dbname, conf, templates, category_str, bestrated_str): fstyle = open (os.path.join (conf[5], "style.css"), "w+") fstyle.write (templates[5][1]) fstyle.close () - except IOError, OSError: + except (IOError, OSError): return False return True @@ -184,13 +416,19 @@ def generate_home_page (dbname, conf, templates, category_str, bestrated_str): def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True): # get the configuration conf = biaweb_db.get_configuration (dbname) + # if cannot get configuration + if conf is False: + return False + # get the templates tpls = biaweb_db.get_templates (dbname) + if tpls is False: + return False # get the list of categories cats = biaweb_db.get_categories (dbname) # cannot get categories return false - if cats == False: + if cats is False: return False # format the categories as a html bulleted list @@ -199,7 +437,7 @@ def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True # get the best rated articles best_rated = biaweb_db.site_get_bestrated (dbname) # if cannot retrieve - if best_rated == False: + if best_rated is False: return False # format the best rated articles as a html bulleted list best_rated_str = html_format_best_rated (best_rated) @@ -214,11 +452,32 @@ def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True # generate the index page including style sheet ret = generate_home_page (dbname, conf, tpls, cats_str, best_rated_str) - if ret == False: + if ret is False: return False + # generate the rss feed + ret = generate_rss_feed (dbname, conf) + if ret is False: + return False + + # generate the category directories and indices ret = generate_category_indices (dbname, conf, tpls, cats_str, best_rated_str, cats) - if ret == False: + if ret is False: + return False + + # generate the article pages + ret = generate_article_pages (dbname, conf, tpls, cats_str, best_rated_str) + if ret is False: + return False + + # copy other files/folders into the destination path + ret = copy_files_folders (conf, files_to_copy, folders_to_copy) + if ret is False: + return False + + # now generate the search index database + ret = generate_search_index (dbname, conf, search_type_full) + if ret is False: return False # finally when all is successfully done return true