# biaweb_exporter.py (from biaweb_qt.git; last commit message: "Updated htaccess to Apache 2.4 compat")
1 # BiaWeb Website content manager (c) 2010 V.Harishankar
2 # Site exporter/generator class
3
4 import os
5 import os.path
6 import sys
7 import time
8 import sqlite3
9 import string
10 import shutil
11 import HTMLParser
12 import cPickle
13 import biaweb_db
14 import biaweb_strings
15
# class to remove HTML tags from a string - used by the search
# generator to get an article without HTML tags
class HTMLTagRemover (HTMLParser.HTMLParser):
    """HTML parser that keeps only the text between tags.

    Feed it markup with feed (), then call get_raw_string () for the
    concatenated character data with every tag discarded."""

    def __init__ (self):
        HTMLParser.HTMLParser.__init__ (self)
        # text fragments collected so far; joined on demand by
        # get_raw_string ()
        self.data_list = []

    def handle_data (self, data):
        # called by the base parser for every run of text outside a tag
        self.data_list.append (data)

    def get_raw_string (self):
        # everything collected so far, as one string
        return "".join (self.data_list)
34
# to format the best rated articles in a HTML link list
def html_format_best_rated (best_rated):
    """Render the best rated articles as an unordered HTML list.

    Each entry links to <category stub>/<article stub>.html (fields 13
    and 8 of an article row) with the article title (field 1) as the
    link text."""
    links = ['<li><a href="' + art[13] + '/' + art[8] + '.html">' + art[1] + '</a></li>\n'
             for art in best_rated]
    return "<ul>\n" + "".join (links) + "</ul>\n"
46
# to format categories in a HTML link list
def html_format_categories (cats):
    """Render the categories as an unordered HTML list.

    Each entry links to <category stub>/ (field 3 of a category row)
    with the category name (field 1) as the link text."""
    links = ['<li><a href="' + cat[3] + '/">' + cat[1] + '</a></li>\n'
             for cat in cats]
    return "<ul>\n" + "".join (links) + "</ul>\n"
57
# to convert a rating number into rating images out of max_stars stars
def html_format_rating (rating, max_stars = 10):
    """Render a numeric rating as a row of star images.

    rating     -- number of filled stars; -1 means "unrated"
    max_stars  -- total slots in the row (default 10, the site's scale;
                  parameterized so other scales can reuse this helper)

    Returns the HTML string of <img> tags, or the text "unrated"."""
    # if -1 then return unrated as the text
    if rating == -1:
        return "unrated"
    # string multiplication replaces the two counting loops; a negative
    # multiplier yields "" exactly as an empty range () did
    filled = '<img src="star.gif" alt="*" />' * rating
    grey = '<img src="star-grey.gif" alt="-" />' * (max_stars - rating)
    return filled + grey
73
# function to generate the search index file
def generate_search_index (dbname, conf, full_text_index = True):
    """Build the CGI search index for the site.

    Strips the HTML from every article, maps each non-stop-word to a set
    of (title, url) tuples and pickles that dictionary (protocol 2) to
    <destination>/cgi-bin/searchwords.idx.  Also writes a .htaccess there
    so the index cannot be fetched from a browser.

    dbname          -- path of the site database
    conf            -- configuration row (conf[0] site url, conf[5] dest dir)
    full_text_index -- index article bodies when True, keywords when False

    Returns True on success, False on any failure."""
    # get all the articles
    arts = biaweb_db.site_articles (dbname)
    # if cannot get articles
    if arts is False:
        return False

    # if full text index, then field should be article content otherwise keywords
    if full_text_index:
        searchfield = 4
    else:
        searchfield = 3

    # initialize the search index dictionary
    search_index = dict ()

    # now run through the articles and generate a table of unique words (except
    # stop words)
    for art in arts:
        # now strip out the HTML tags from the articles
        parser = HTMLTagRemover ()
        parser.feed (art[searchfield])
        parser.close ()
        # get the word list
        word_list = parser.get_raw_string ().split ()

        # the search entry title is the article title and the url is
        # http://siteurl/Category/Article.html - both are the same for every
        # word of this article, so compute them once outside the word loop
        title = art[1]
        url = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"

        # now run through each word, make it lowercase, remove all cruft from it
        # and add it to a dictionary
        for word in word_list:
            cleanword = word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower ()
            # if the word is not a "stop word", then add it to the search database
            if cleanword not in biaweb_strings.stopwords:
                # setdefault creates the set on first sight of the word;
                # dict.has_key is deprecated, this is the idiomatic spelling
                search_index.setdefault (cleanword, set ()).add ((title, url))

    # done now write the search database as a python pickle object of search_index
    search_index_path = os.path.join (conf[5], "cgi-bin", "searchwords.idx")
    htaccess_path = os.path.join (conf[5], "cgi-bin", ".htaccess")
    try:
        # dump the dictionary as a pickle object in binary mode; try/finally
        # guarantees the handle is closed even when a write raises
        fsearchindex = open (search_index_path, "wb")
        try:
            cPickle.dump (search_index, fsearchindex, 2)
        finally:
            fsearchindex.close ()
        # write the htaccess file to prevent opening the index file from web browser
        fhtaccess = open (htaccess_path, "w+")
        try:
            fhtaccess.write (biaweb_strings.searchindex_htaccess)
        finally:
            fhtaccess.close ()
    except (OSError, IOError):
        return False

    # finally return true
    return True
140
# function to copy additional files and folders to the destination path
def copy_files_folders (conf, files_to_copy, folders_to_copy):
    """Copy the search script, the rating images and any user supplied
    files/folders into the destination directory conf[5].

    files_to_copy / folders_to_copy -- sequences of (source path,
    destination path relative to conf[5]) tuples.

    Returns True on success, False on the first copy/create error."""
    # always create the cgi-bin directory: generate_search_index () writes
    # the search index into it even when search.py is missing here and has
    # to be copied manually by the user (the old code only made the
    # directory when search.py was found)
    cgibin_path = os.path.join (conf[5], "cgi-bin")
    try:
        os.mkdir (cgibin_path)
    except (IOError, OSError):
        return False

    # try to copy search.py into the destination directory if possible
    # otherwise user must copy it manually
    search_script_path = os.path.join (sys.path[0], "search.py")
    if os.path.exists (search_script_path):
        try:
            shutil.copy2 (search_script_path, cgibin_path)
        except (IOError, OSError):
            return False

    # try to copy the star rating images to destination directory if possible
    # otherwise user must copy them manually (one loop instead of two
    # copy/except blocks)
    for img in ("star.gif", "star-grey.gif"):
        img_path = os.path.join (sys.path[0], img)
        if os.path.exists (img_path):
            try:
                shutil.copy2 (img_path, conf[5])
            except (IOError, OSError):
                return False

    # additional files to copy; the old "<> []" guards are gone - the "<>"
    # operator was removed in Python 3 and an empty sequence simply skips
    # the loop anyway
    for src, dest in files_to_copy:
        # get full path from relative path in dest
        full_dest = os.path.join (conf[5], dest)
        try:
            shutil.copy2 (src, full_dest)
        except (IOError, OSError):
            return False

    # additional folders to copy
    for src, dest in folders_to_copy:
        # get full path from relative path in dest
        full_dest = os.path.join (conf[5], dest)
        try:
            shutil.copytree (src, full_dest)
        except (IOError, OSError):
            return False

    # finally return true
    return True
195
# function to generate article pages
def generate_article_pages (dbname, conf, templates, category_str, bestrated_str):
    """Generate one HTML page per article under <destination>/<Category>/.

    templates[0][1] is the main page template, templates[1][1] the
    per-article bit; category_str / bestrated_str are pre-rendered
    sidebar lists.  Returns True on success, False on failure."""
    # main template
    tpl_main = string.Template (templates[0][1])
    # article template
    tpl_articlebit = string.Template (templates[1][1])

    # get all articles from the database
    articles = biaweb_db.site_articles (dbname)
    if articles is False:
        # bug fix: this used to be a bare "return" (None), which the
        # caller's "if ret is False" test silently missed
        return False

    # walk through each article and generate the file in the appropriate category
    # folder
    for art in articles:
        art_cdate = time.ctime (art[5])
        art_mdate = time.ctime (art[6])
        rating_str = html_format_rating (art[9])
        # now build the article from the article bit template
        article_str = tpl_articlebit.safe_substitute (article_title = art[1],
                        article_cdate = art_cdate,
                        article_mdate = art_mdate,
                        rating = rating_str,
                        article_contents = art[4])

        # now build the article page
        articlepage_str = tpl_main.safe_substitute (site_title = art[1],
                        site_url = "http://" + conf[0],
                        meta_keywords = art[3],
                        meta_description = art[2],
                        page_title = conf[1],
                        page_desc = conf[3],
                        contents_bit = article_str,
                        list_of_categories = category_str,
                        list_best_rated = bestrated_str,
                        copyright = conf[6])
        # write to the article file; try/finally closes the handle that the
        # original code leaked
        try:
            farticle = open (os.path.join (conf[5], art[13], art[8] + ".html"), "w+")
            try:
                farticle.write (articlepage_str)
            finally:
                farticle.close ()
        except (OSError, IOError):
            return False

    # finally return true
    return True
241
# function to generate category directories and indices
def generate_category_indices (dbname, conf, templates, category_str, bestrated_str, category_list):
    """Create a directory plus an index.html for every category.

    The index page lists the category's articles in a table (url, title,
    creation date, rating stars).  Returns True on success, False on the
    first failure."""
    page_tpl = string.Template (templates[0][1])    # main page template
    table_tpl = string.Template (templates[3][1])   # article table bit
    row_tpl = string.Template (templates[4][1])     # single table row bit

    for cat in category_list:
        # create the directory named after the category stub
        try:
            os.mkdir (os.path.join (conf[5], cat[3]))
        except (IOError, OSError):
            return False

        # fetch this category's articles
        arts = biaweb_db.site_articles (dbname, cat[0])
        if arts is False:
            return False

        # one table row per article in the category
        rows = [row_tpl.safe_substitute (article_url = art[13] + "/" + art[8] + ".html",
                                         title = art[1],
                                         created = time.ctime (art[5]),
                                         rating = html_format_rating (art[9]))
                for art in arts]

        # wrap the rows in the table bit ...
        table_html = table_tpl.safe_substitute (category_title = cat[1],
                                                category_desc = cat[2],
                                                table_rows = "".join (rows))

        # ... and the table in the full page template
        page_html = page_tpl.safe_substitute (site_title = conf[1] + " - " + cat[1],
                                              site_url = "http://" + conf[0],
                                              meta_keywords = conf[2],
                                              meta_description = cat[2],
                                              page_title = conf[1],
                                              page_desc = conf[3],
                                              contents_bit = table_html,
                                              list_of_categories = category_str,
                                              list_best_rated = bestrated_str,
                                              copyright = conf[6])

        # write Category/index.html
        try:
            findex = open (os.path.join (conf[5], cat[3], "index.html"), "w+")
            findex.write (page_html)
            findex.close ()
        except (OSError, IOError):
            return False

    return True
306
# function to generate the RSS feed for the website
def generate_rss_feed (dbname, conf):
    """Write the RSS feed to <destination>/subscribe.xml.

    Uses the latest articles (capped at conf[4] entries) as the feed
    items.  Returns True on success, False on failure."""
    # rss main template
    tpl_rss = string.Template (biaweb_strings.template_rss)
    # rss item bit template
    tpl_rss_itembit = string.Template (biaweb_strings.template_rss_item)

    # get the latest articles (limit by number of rss items)
    arts = biaweb_db.site_latest_articles (dbname, conf[4])
    if arts is False:
        return False

    rss_item_list = []
    # run through the articles and generate the rss items
    for art in arts:
        # link is http://<site>/<category stub>/<article stub>.html
        itemlink = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"
        item_str = tpl_rss_itembit.safe_substitute (item_title = art[1],
                        item_link = itemlink,
                        description = art[2])
        rss_item_list.append (item_str)

    # now get the rss items as a string
    rss_item_str = "".join (rss_item_list)

    # now generate the feed
    rss_str = tpl_rss.safe_substitute (title = conf[1],
                    link = "http://" + conf[0],
                    description = conf[3],
                    rss_items = rss_item_str)

    # now try to write it to the rss file; try/finally closes the handle
    # that the original code leaked
    try:
        frss = open (os.path.join (conf[5], "subscribe.xml"), "w+")
        try:
            frss.write (rss_str)
        finally:
            frss.close ()
    except (IOError, OSError):
        return False

    # finally return true
    return True
347
# function to generate main index file and stylesheet
def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
    """Write index.html and style.css into the destination directory.

    The home page shows the latest articles as news items; templates[5][1]
    is the raw stylesheet text.  Returns True on success, False on
    failure."""
    page_tpl = string.Template (templates[0][1])    # main page template
    index_tpl = string.Template (templates[6][1])   # index contents bit
    news_tpl = string.Template (templates[2][1])    # single news item bit

    # the latest articles double as the news items; conf[4] (number of RSS
    # entries) also caps how many appear on the front page
    recent = biaweb_db.site_latest_articles (dbname, conf[4])
    if recent is False:
        return False

    # one news bit per recent article; the link is Category/Article.html,
    # art[5] the creation time, art[1]/art[2] title and summary
    news_html = "".join (
        news_tpl.safe_substitute (news_title = art[1],
                                  news_link = art[13] + "/" + art[8] + ".html",
                                  news_datetime = time.ctime (art[5]),
                                  news_description = art[2])
        for art in recent)

    # drop the news into the index bit ...
    index_html = index_tpl.safe_substitute (site_name = conf[1],
                                            news_updates = news_html)
    # ... and the index bit into the main page template
    page_html = page_tpl.safe_substitute (site_title = conf[1],
                                          site_url = "http://" + conf[0],
                                          meta_keywords = conf[2],
                                          meta_description = conf[3],
                                          page_title = conf[1],
                                          page_desc = conf[3],
                                          contents_bit = index_html,
                                          list_of_categories = category_str,
                                          list_best_rated = bestrated_str,
                                          copyright = conf[6])

    # write the index.html file in the destination directory
    try:
        fout = open (os.path.join (conf[5], "index.html"), "w+")
        fout.write (page_html)
        fout.close ()
    except (IOError, OSError):
        return False

    # write the style.css file in the destination directory
    try:
        fout = open (os.path.join (conf[5], "style.css"), "w+")
        fout.write (templates[5][1])
        fout.close ()
    except (IOError, OSError):
        return False

    return True
414
# superfunction to generate the site
def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True):
    """Drive the whole export: load config/templates/categories, wipe and
    recreate the destination tree, then run every generation step in
    order.  Returns True when everything succeeded, False on the first
    failure."""
    # the configuration record
    conf = biaweb_db.get_configuration (dbname)
    if conf is False:
        return False

    # the page templates
    tpls = biaweb_db.get_templates (dbname)
    if tpls is False:
        return False

    # the categories, rendered once as a HTML bulleted list for the sidebar
    cats = biaweb_db.get_categories (dbname)
    if cats is False:
        return False
    cats_str = html_format_categories (cats)

    # the best rated articles, likewise rendered once
    best_rated = biaweb_db.site_get_bestrated (dbname)
    if best_rated is False:
        return False
    best_rated_str = html_format_best_rated (best_rated)

    # wipe the destination tree and start from an empty directory
    try:
        if os.path.exists (conf[5]):
            shutil.rmtree (conf[5])
        os.mkdir (conf[5])
    except OSError:
        return False

    # the generation steps, in order: home page + stylesheet, RSS feed,
    # category indices, article pages, extra files/folders, search index
    steps = (
        lambda: generate_home_page (dbname, conf, tpls, cats_str, best_rated_str),
        lambda: generate_rss_feed (dbname, conf),
        lambda: generate_category_indices (dbname, conf, tpls, cats_str, best_rated_str, cats),
        lambda: generate_article_pages (dbname, conf, tpls, cats_str, best_rated_str),
        lambda: copy_files_folders (conf, files_to_copy, folders_to_copy),
        lambda: generate_search_index (dbname, conf, search_type_full),
    )
    # stop at the first step that reports failure
    for step in steps:
        if step () is False:
            return False

    # finally when all is successfully done return true
    return True