/*
* $Id: tocgen.bsh,v 1.3 2004/01/30 06:46:33 mitch Exp $
*
* OSI Certified Open Source Software (see www.opensource.org for details)
* Licensed under the BSD license:
*
* Copyright (c) 2004, FullSpan Software (www.fullspan.com)
* All rights reserved
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* - Neither the name of FullSpan Software nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
// Run with beanshell (www.beanshell.org):
//
// java bsh.Interpreter tocgen.bsh INFILE OUTFILE STYLESHEET
//
// See tocgen.html for detailed usage information.
import java.io.*;
import java.util.*;
import java.util.regex.*;
import javax.xml.transform.*;
import javax.xml.transform.stream.*;
import org.jdom.*;
import org.jdom.filter.*;
import org.jdom.input.*;
import org.jdom.output.*;
import org.jdom.transform.*;
import org.jdom.xpath.*;
// Constants
// When true, the main body prints each heading as it is found and again
// after it has been numbered.
final boolean trace = false;

// Usage text, printed line by line when the argument count is wrong.
final String[] usageLines =
{
    "",
    "TOCGen v1.0",
    "Usage:",
    "",
    " java bsh.Interpreter tocgen.bsh INFILE OUTFILE STYLESHEET",
    "",
    "Adds a Table of Contents to an XHTML document. See tocgen.html for",
    "detailed usage information.",
    ""
};

// Deepest heading element handled (h6).
final int MAX_HEADING_LEVEL = 6;

// Prefix of the id given to every anchor this script adds to a heading.
final String ANCHOR_PREFIX = "toc-";

// XHTML documents are namespaced, so every XPath expression has to be
// qualified. The prefix "x" is arbitrary; it only has to match the prefix
// used in the XPath strings.
// See: http://www.servlets.com/archive/servlet/ReadMsg?msgId=425772&listName=jdom-interest
final Namespace NS = Namespace.getNamespace("x", "http://www.w3.org/1999/xhtml");

// Selects only Text and Element children when walking a heading's content.
final Filter TEXT_AND_ELEMENT_FILTER =
    new ContentFilter(ContentFilter.TEXT | ContentFilter.ELEMENT);

// Matches an existing heading number at the start of a heading:
//   optional whitespace, a run of digits and dots, optional whitespace.
final Pattern HEADING_PATTERN =
    Pattern.compile("^\\s*[\\d\\.]+\\s*");

// Single shared matcher; callers reset() it onto the text to examine.
final Matcher HEADING_MATCHER = HEADING_PATTERN.matcher("");
// Objects (closures == pseudo-classes - see BeanShell User's Guide)
/**
 * Scripted object wrapping one heading element (h1...h6).
 *
 * Fields, read directly by the main body:
 *   m_elem              - the wrapped heading Element
 *   m_level             - numeric level derived from the element name
 *   m_headingNumberText - dotted heading number; "" until setHeadingNumber()
 *   m_anchorId          - ANCHOR_PREFIX + heading number; "" until
 *                         setHeadingNumber()
 *   m_contentText       - text for the contents entry; "" until
 *                         setHeadingText()
 */
Heading(elem)
{
    m_elem = elem;
    m_level = getHeadingLevel(m_elem.getName());
    m_headingNumberText = "";
    m_anchorId = "";
    m_contentText = "";

    // Store the dotted heading number built from the current counters and
    // derive the anchor id from it.
    void setHeadingNumber(headingNumbers, firstLevel)
    {
        m_headingNumberText = getHeadingNumberText(headingNumbers, firstLevel);
        m_anchorId = ANCHOR_PREFIX + m_headingNumberText;
    }

    // Rewrite the first text node of the heading so that it starts with the
    // current heading number (replacing any number already there), then
    // record the full heading text for the contents entry.
    // Throws RuntimeException when the heading contains no text node.
    void setHeadingText()
    {
        textNode = getFirstTextNode(m_elem);
        if (textNode == null)
        {
            throw new RuntimeException("Missing text for heading: " + m_elem);
        }
        text = textNode.getTextTrim();
        // Strip a leading heading number left over from an earlier run.
        HEADING_MATCHER.reset(text);
        text = HEADING_MATCHER.replaceFirst("");
        text = m_headingNumberText + " " + text;
        textNode.setText(text);

        // Get the text for the table of contents for this heading.
        // If we just use m_elem.getTextTrim() this will only give us the
        // direct text.
        //
        // For example, if we have a heading like this:
        //
        //   <h1>This is an <i>important</i> chapter</h1>
        //
        // this would give us the contents entry: "This is an chapter",
        // because the italicized text is in a child element.
        // So we need to extract the text recursively from all child
        // elements and use that for the contents entry. In the
        // above example this would give the contents entry:
        // "This is an important chapter".
        m_contentText = getAllText(m_elem);
    }

    return this;
}
// Utility methods
/**
 * Create an XPath for the given expression with the XHTML namespace (NS)
 * registered on it, so the expression can use that namespace's prefix.
 */
XPath getXPath(path)
{
    XPath namespaced = XPath.newInstance(path);
    namespaced.addNamespace(NS);
    return namespaced;
}
/**
 * Tell whether the node is a heading: an Element whose name is exactly
 * "h" followed by one digit from 1 to 6. Anything else yields false.
 */
boolean isHeading(node)
{
    if (!(node instanceof Element))
    {
        return false;
    }
    String tagName = node.getName();
    if (tagName == null || tagName.length() != 2)
    {
        return false;
    }
    if (!tagName.startsWith("h"))
    {
        return false;
    }
    char levelChar = tagName.charAt(1);
    return levelChar >= '1' && levelChar <= '6';
}
/**
 * Map a heading element name to its level: "h1" gives 1, "h2" gives 2, ...
 * The level is read from the second character of the name.
 */
int getHeadingLevel(elemName)
{
    char levelDigit = elemName.charAt(1);
    return levelDigit - '0';
}
/**
 * Build the dotted heading number, e.g. "2.1.4", from the counters in
 * headingNumbers. Counters are read starting at index firstLevel and the
 * number ends at the first counter that is zero (or at the end of the
 * array).
 */
String getHeadingNumberText(headingNumbers, firstLevel)
{
    StringBuffer dotted = new StringBuffer();
    int level = firstLevel;
    while (level < headingNumbers.length && headingNumbers[level] != 0)
    {
        // Every component after the first is preceded by a dot.
        if (dotted.length() > 0)
        {
            dotted.append(".");
        }
        dotted.append(headingNumbers[level]);
        level++;
    }
    return dotted.toString();
}
/**
 * Find the first Text node at or below the given node, depth-first.
 * Returns the node itself when it is a Text, otherwise the first Text found
 * among its Text/Element content, or null when there is none.
 */
Text getFirstTextNode(node)
{
    if (node instanceof Text)
    {
        return node;
    }
    // Not a Text, so it is an Element: descend into its children in order.
    Iterator children = node.getContent(TEXT_AND_ELEMENT_FILTER).iterator();
    while (children.hasNext())
    {
        Text found = getFirstTextNode(children.next());
        if (found != null)
        {
            return found;
        }
    }
    return null;
}
/**
 * Return the text of all nodes at and below the given node, searching
 * recursively. Search is depth-first.
 *
 * The trimmed text fragments are joined with single spaces. Fragments that
 * are empty after trimming (whitespace-only text nodes) are skipped, and no
 * separator is added after the last fragment, so the result has no leading,
 * trailing or doubled separator spaces.
 */
String getAllText(node)
{
    ArrayList fragments = new ArrayList();
    getAllText(node, fragments);
    StringBuffer joined = new StringBuffer();
    for (fragment: fragments)
    {
        if (fragment.length() == 0)
        {
            continue;
        }
        if (joined.length() > 0)
        {
            joined.append(" ");
        }
        joined.append(fragment);
    }
    return joined.toString();
}
/**
 * Collect the trimmed text of every Text node at and below the given node,
 * depth-first, appending each piece to textList in document order.
 */
void getAllText(node, textList)
{
    if (!(node instanceof Text))
    {
        // An Element: recurse into its Text and Element children.
        Iterator children = node.getContent(TEXT_AND_ELEMENT_FILTER).iterator();
        while (children.hasNext())
        {
            getAllText(children.next(), textList);
        }
        return;
    }
    textList.add(node.getTextTrim());
}
// ========== Begin main body
// Process command line: exactly three arguments are expected
// (INFILE OUTFILE STYLESHEET). With any other count, print the usage text
// and stop.
if (bsh.args.length != 3)
{
    for (int lineNo = 0; lineNo < usageLines.length; lineNo++)
    {
        print(usageLines[lineNo]);
    }
    exit();
}

inFilename = bsh.args[0];
outFilename = bsh.args[1];
stylesheetFilename = bsh.args[2];

inFile = new File(inFilename);
outFile = new File(outFilename);
stylesheetFile = new File(stylesheetFilename);

// The input document and the stylesheet must already exist; the output
// file is not checked.
if (!inFile.exists())
{
    print("Input file does not exist: " + inFilename);
    exit();
}
if (!stylesheetFile.exists())
{
    print("Stylesheet file does not exist: " + stylesheetFilename);
    exit();
}
// Load the input file
saxBuilder = new SAXBuilder();
jdomDoc = saxBuilder.build(inFile);
rootElem = jdomDoc.getRootElement();

// Find where we will insert the toc: a div element in the XHTML namespace
// whose id attribute is "toc". Without it there is nowhere to put the
// table of contents, so report what is expected and stop.
tocElem = getXPath("//x:div[@id='toc']").selectSingleNode(rootElem);
if (tocElem == null)
{
    print("Missing toc placeholder. Expected: <div id=\"toc\"></div>");
    exit();
}
// Get all of the headings: select the namespaced elements, keep those that
// isHeading() accepts, and wrap each one in a Heading object.
bodyElem = getXPath("x:body").selectSingleNode(rootElem);
allNodes = getXPath("//x:*").selectNodes(bodyElem);
headings = new ArrayList();
for (Iterator nodeIter = allNodes.iterator(); nodeIter.hasNext(); )
{
    Object candidate = nodeIter.next();
    if (isHeading(candidate))
    {
        headings.add(Heading(candidate));
    }
}

// Nothing to number and no contents to build.
if (headings.isEmpty())
{
    print("There are no headings in the source document.");
    exit();
}

if (trace)
{
    for (heading: headings)
    {
        print(heading.m_elem.getName() + ": " + heading.m_elem.getTextTrim());
    }
}
// headingNumbers holds the running heading number for each level.
// Slot [0] is unused (marked with -1); slots [1]..[6] are the current
// numbers for h1...h6 and all start at zero.
int[] headingNumbers = new int[MAX_HEADING_LEVEL + 1];
Arrays.fill(headingNumbers, 0);
headingNumbers[0] = -1;

// Level of the first heading in the document; 0 until that heading is seen.
firstLevel = 0;

for (heading: headings)
{
    int level = heading.m_level;

    if (firstLevel == 0)
    {
        firstLevel = level;
    }
    else if (level < firstLevel)
    {
        // No later heading may be more prominent than the first one.
        print("Error: the first heading in the document must have the " +
            "lowest hN number, this heading is not valid: " +
            heading.m_elem.getName() + ": " + heading.m_elem.getTextTrim());
        exit();
    }

    headingNumbers[level]++;

    // Every level between the first level and this one must already have
    // a number, otherwise a level has been skipped.
    for (int parent = firstLevel; parent < level; parent++)
    {
        if (headingNumbers[parent] < 1)
        {
            print("Error: gap in heading levels. This heading is more than " +
                "one level deeper than any previous heading: " +
                heading.m_elem.getName() + ": " + heading.m_elem.getTextTrim());
            exit();
        }
    }

    // A heading restarts the numbering of all less prominent levels
    // (those with higher hN numbers).
    Arrays.fill(headingNumbers, level + 1, MAX_HEADING_LEVEL + 1, 0);

    heading.setHeadingNumber(headingNumbers, firstLevel);
    heading.setHeadingText();

    if (trace)
    {
        print(heading.m_headingNumberText + ": " + heading.m_elem.getTextTrim());
    }
}
// Create an in-memory XML document, tocDataDoc, to hold the table of
// contents data. For example:
//
// <toc>
//   <entry refid="toc-1" level="0">1 A Heading</entry>
//   <entry refid="toc-1.1" level="1">1.1 Another Heading</entry>
// </toc>
// Root of the TOC data document; one "entry" child is added per heading.
tocDataRoot = new Element("toc");
tocDataDoc = new Document(tocDataRoot);

for (heading: headings)
{
    // Drop anchors left by a previous run (direct child anchors whose id
    // starts with ANCHOR_PREFIX), so that processing the same file again
    // does not create duplicate tags.
    anchors = getXPath("x:a[@id]").selectNodes(heading.m_elem);
    for (int pos = 0; pos < anchors.size(); pos++)
    {
        Element existing = (Element) anchors.get(pos);
        if (existing.getAttributeValue("id").startsWith(ANCHOR_PREFIX))
        {
            existing.detach();
        }
    }

    // Add one new anchor tag to the heading, in the heading's namespace.
    anchor = new Element("a", heading.m_elem.getNamespace());
    anchor.setAttribute("id", heading.m_anchorId);
    heading.m_elem.addContent(anchor);

    // Describe this heading in the TOC data document: its text, the anchor
    // it refers to, and its level relative to the first heading level.
    tocDataElem = new Element("entry");
    tocDataElem.setText(heading.m_contentText);
    tocDataElem.setAttribute("refid", heading.m_anchorId);
    tocDataElem.setAttribute("level",
        String.valueOf(heading.m_level - firstLevel));
    tocDataRoot.addContent(tocDataElem);
}
// Create TOC display by running the TOC data document through the
// stylesheet; the output is collected as JDOM nodes in tocResult.
transformerFactory = TransformerFactory.newInstance();
transformer = transformerFactory.newTransformer(new StreamSource(stylesheetFile));
JDOMResult tocResult = new JDOMResult();
transformer.transform(new JDOMSource(tocDataDoc), tocResult);

// Delete all the current content in the main doc TOC (if any)
tocElem.getContent().clear();

// Merge transformed doc into main doc: each result node becomes a child of
// the toc placeholder. Result nodes that are Elements are first put into
// the placeholder's namespace.
for (Iterator resultIter = tocResult.getResult().iterator(); resultIter.hasNext(); )
{
    Object resultNode = resultIter.next();
    if (resultNode instanceof Element)
    {
        resultNode.setNamespace(tocElem.getNamespace());
    }
    tocElem.addContent(resultNode);
}
// Output the merged doc as UTF-8, with trimmed text, indentation and
// newlines.
XMLOutputter outputter = new XMLOutputter();
outputter.setEncoding("UTF-8");
outputter.setTextTrim(true);
outputter.setIndent(" ");
outputter.setNewlines(true);

outStream = new FileOutputStream(outFile);
try
{
    outputter.output(jdomDoc, outStream);
}
finally
{
    // Close the stream even when writing fails, so the file handle is
    // not leaked.
    outStream.close();
}