#!/usr/bin/env python
# Copyright 2011 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
'''This utility cleans up the html files as emitted by doxygen so
that they are suitable for publication on a Google documentation site.
'''
import argparse
import glob
import os
import re
import shutil
import sys
try:
from BeautifulSoup import BeautifulSoup, Tag
except (ImportError, NotImplementedError):
print ("This tool requires the BeautifulSoup package "
"(see http://www.crummy.com/software/BeautifulSoup/).\n"
"Make sure that the file BeautifulSoup.py is either in this directory "
"or is available in your PYTHON_PATH")
raise
def Trace(msg):
    '''Writes a diagnostic message to stderr when verbose mode is on.

    The message is converted with str() and terminated with a newline.
    Nothing is written unless the function attribute Trace.verbose is true.
    '''
    if not Trace.verbose:
        return
    line = str(msg)
    sys.stderr.write(line + '\n')


# Tracing is off by default; main() turns it on for -v/--verbose.
Trace.verbose = False
# Glob patterns, relative to the output directory, of the doxygen-generated
# files that main() deletes before fixing up the remaining HTML.
FILES_TO_REMOVE = [
    # Wildcards by extension.
    '*.css',
    '*.map',
    '*.md5',
    # Individual pages, images and page families.
    'annotated.html',
    'bc_s.png',
    'classes.html',
    'closed.png',
    'doxygen.png',
    'files.html',
    'functions*.html',
    'globals_0x*.html',
    'globals_enum.html',
    'globals_eval.html',
    'globals_func.html',
    'globals.html',
    'globals_type.html',
    'globals_vars.html',
    'graph_legend.html',
    'graph_legend.png',
    'hierarchy.html',
    'index_8dox.html',
    'index.html',
    'modules.html',
    'namespacemembers_func.html',
    'namespacemembers.html',
    'namespaces.html',
    'nav_f.png',
    'nav_h.png',
    'open.png',
    'tab_a.png',
    'tab_b.png',
    'tab_h.png',
    'tab_s.png',
]
class HTMLFixer(object):
    '''Cleans up the html strings produced by Doxygen.

    The html is parsed with BeautifulSoup in the constructor.  The Fix* and
    Remove* methods then edit that parse tree in place, and FixAll() runs
    all of them and returns the resulting markup as a string.
    '''

    # Matches the "class" attribute of the <div> sections that
    # RemoveTopHeadings() drops.
    _TOP_HEADING_CLASS_RE = re.compile('^(header|tabs[0-9]*|navpath)$')

    # Matches a "_<digit>_<digit>" suffix that is immediately followed by a
    # double quote, a colon, a space or '<' (the follower is not consumed).
    _VERSION_SUFFIX_RE = re.compile(r'(_\d_\d)(?=[": <])')

    def __init__(self, html):
        '''Parses html (a string of markup) into self.soup.'''
        self.soup = BeautifulSoup(html)

    def FixTableHeadings(self):
        '''Fixes the doxygen table headings.

        This includes:
          - Using bare <h2> title row instead of row embedded in <tr><td> in
            table
          - Splitting up tables into multiple separate tables if a table
            heading appears in the middle of a table.

        Copying the anchor's "name" attribute into the "id" attribute of the
        row is currently disabled (see the commented-out line below).

        For example, this html:
         <table>
          <tr><td colspan="2"><h2><a name="pub-attribs"></a>
          Data Fields List</h2></td></tr>
          ...
         </table>

        would be converted to this:
         <h2>Data Fields List</h2>
         <table>
          ...
         </table>
        '''
        table_headers = []
        for tag in self.soup.findAll('tr'):
            # Use get() rather than indexing: an <a> without a "name"
            # attribute (e.g. a plain href link inside a heading) must simply
            # be skipped instead of raising KeyError.
            if (tag.td and tag.td.h2 and tag.td.h2.a
                    and tag.td.h2.a.get('name')):
                #tag['id'] = tag.td.h2.a['name']
                # The element following the anchor is used as the heading
                # text, and the <tr> itself is renamed into an <h2>.
                tag.string = tag.td.h2.a.next
                tag.name = 'h2'
                table_headers.append(tag)

        # reverse the list so that earlier tags don't delete later tags
        table_headers.reverse()

        # Split up tables that have multiple table header (th) rows
        for tag in table_headers:
            Trace("Header tag: %s is %s" % (tag.name, tag.string.strip()))
            # Is this a heading in the middle of a table?
            if tag.findPreviousSibling('tr') and tag.parent.name == 'table':
                Trace("Splitting Table named %s" % tag.string.strip())
                table = tag.parent
                table_parent = table.parent
                table_index = table_parent.contents.index(table)
                # The new table gets the same attributes as the one being
                # split and is placed directly after it.
                new_table = Tag(self.soup, name='table', attrs=table.attrs)
                table_parent.insert(table_index + 1, new_table)
                tag_index = table.contents.index(tag)
                # Iterate over a slice (a snapshot) of the heading and
                # everything after it while inserting into the new table.
                for index, row in enumerate(table.contents[tag_index:]):
                    new_table.insert(index, row)

            # Now move the <h2> tag to be in front of the <table> tag
            assert tag.parent.name == 'table'
            table = tag.parent
            table_parent = table.parent
            table_index = table_parent.contents.index(table)
            table_parent.insert(table_index, tag)

    def RemoveTopHeadings(self):
        '''Removes <div> sections with a header, tabs, or navpath class
        attribute.'''
        header_tags = self.soup.findAll(
            name='div',
            attrs={'class': self._TOP_HEADING_CLASS_RE})
        # A plain loop: extract() is called only for its side effect.
        for tag in header_tags:
            tag.extract()

    def RemoveVersionNumbers(self, html):
        '''Horrible hack to strip _#_# from struct names.

        Returns html with every "_<digit>_<digit>" suffix removed when it is
        directly followed by a double quote, a colon, a space or '<'.
        '''
        return self._VERSION_SUFFIX_RE.sub('', html)

    def FixAll(self):
        '''Applies every fix and returns the cleaned-up html as a string.'''
        self.FixTableHeadings()
        self.RemoveTopHeadings()
        html = str(self.soup)
        html = self.RemoveVersionNumbers(html)
        return html
def main(args):
    """Entry point of the doxy_cleanup utility.

    Parses args (the command line without the program name), moves the
    contents of <directory>/html up into <directory>, removes the now empty
    'html' directory, deletes every file matching FILES_TO_REMOVE and
    rewrites each remaining .html file through HTMLFixer.

    Returns 0 on completion; any error raised while processing propagates.
    """
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument('-v', '--verbose', help='verbose output.',
                            action='store_true')
    arg_parser.add_argument('directory')
    parsed = arg_parser.parse_args(args)

    if parsed.verbose:
        Trace.verbose = True

    top_dir = parsed.directory
    doxygen_dir = os.path.join(top_dir, 'html')

    # Doxygen writes everything below 'html'; lift those entries into
    # top_dir first.
    for entry in glob.glob(os.path.join(doxygen_dir, '*')):
        Trace('Moving %s -> %s' % (entry, top_dir))
        shutil.move(entry, top_dir)

    # The 'html' directory itself is no longer needed.
    Trace('Removing %s' % doxygen_dir)
    os.rmdir(doxygen_dir)

    # Drop the generated files that are not wanted.
    for pattern in FILES_TO_REMOVE:
        Trace('Removing "%s":' % pattern)
        for unwanted in glob.glob(os.path.join(top_dir, pattern)):
            Trace(' Removing "%s"' % unwanted)
            os.remove(unwanted)

    # Finally rewrite, in place, each HTML file that was kept.
    Trace('Fixing HTML files...')
    for dirpath, _, basenames in os.walk(top_dir):
        for basename in basenames:
            extension = os.path.splitext(basename)[1]
            if extension != '.html':
                Trace('Skipping %s' % basename)
                continue

            html_path = os.path.join(dirpath, basename)
            Trace('Processing "%s"...' % html_path)
            try:
                with open(html_path) as infile:
                    markup = infile.read()

                cleaned = HTMLFixer(markup).FixAll()

                with open(html_path, 'w') as outfile:
                    outfile.write(cleaned)
            except:
                # Name the offending file, then let the error propagate.
                sys.stderr.write("Error while processing %s\n" % html_path)
                raise

    return 0
if __name__ == '__main__':
    # Run the tool on the command-line arguments; Ctrl-C is reported on
    # stderr and becomes exit status 1 instead of a traceback.
    try:
        exit_status = main(sys.argv[1:])
    except KeyboardInterrupt:
        script_name = os.path.basename(__file__)
        sys.stderr.write('%s: interrupted\n' % script_name)
        exit_status = 1
    sys.exit(exit_status)