/* * $Id: tocgen.bsh,v 1.3 2004/01/30 06:46:33 mitch Exp $ * * OSI Certified Open Source Software (see www.opensource.org for details) * Licensed under the BSD license: * * Copyright (c) 2004, FullSpan Software (www.fullspan.com) * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * - Neither the name of FullSpan Software nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ // Run with beanshell (www.beanshell.org): // // java bsh.Interpreter tocgen.bsh INFILE OUTFILE STYLESHEET // // See tocgen.html for detailed usage information. 
import java.io.*;
import java.util.*;
import java.util.regex.*;
import javax.xml.transform.*;
import javax.xml.transform.stream.*;
import org.jdom.*;
import org.jdom.filter.*;
import org.jdom.input.*;
import org.jdom.output.*;
import org.jdom.transform.*;
import org.jdom.xpath.*;

// ========== Constants

// Usage text, printed one entry per line when the argument count is wrong.
final String[] usageLines = {
    "",
    "TOCGen v1.0",
    "Usage:",
    "",
    " java bsh.Interpreter tocgen.bsh INFILE OUTFILE STYLESHEET",
    "",
    "Adds a Table of Contents to an XHTML document. See tocgen.html for",
    "detailed usage information.",
    ""
};

// Set to true to print each heading as it is found and as it is numbered.
final boolean trace = false;

// Because XHTML input files have a namespace, we need to qualify our
// XPath expressions. We just use an arbitrary "x" as the namespace prefix.
// See: http://www.servlets.com/archive/servlet/ReadMsg?msgId=425772&listName=jdom-interest
final Namespace NS = Namespace.getNamespace("x", "http://www.w3.org/1999/xhtml");

// Highest N for which an hN element is treated as a heading.
final int MAX_HEADING_LEVEL = 6;

// Prefix of the id attribute of every anchor this script generates.
final String ANCHOR_PREFIX = "toc-";

// Pattern for an existing heading number at the start of the heading text:
//   Optional whitespace
//   Sequence of numbers and dots
//   Optional whitespace
final Pattern HEADING_PATTERN = Pattern.compile("^\\s*[\\d\\.]+\\s*");

// A single matcher, reset() against each heading's text before use.
final Matcher HEADING_MATCHER = HEADING_PATTERN.matcher("");

// Selects only Text and Element children when walking a node's content.
final Filter TEXT_AND_ELEMENT_FILTER =
    new ContentFilter(ContentFilter.TEXT | ContentFilter.ELEMENT);

// ========== Objects (closures == pseudo-classes - see BeanShell User's Guide)

/**
 * Scripted object wrapping one heading element (h1...h6).
 *
 * Fields:
 *   m_elem              - the heading element itself
 *   m_level             - 1...6, taken from the element name
 *   m_headingNumberText - dotted heading number, set by setHeadingNumber()
 *   m_anchorId          - ANCHOR_PREFIX + m_headingNumberText
 *   m_contentText       - text of the table of contents entry, set by
 *                         setHeadingText()
 */
Heading(elem)
{
    m_elem = elem;
    m_level = getHeadingLevel(m_elem.getName());
    m_headingNumberText = "";
    m_anchorId = "";
    m_contentText = "";

    /**
     * Compute this heading's dotted number and anchor id from the current
     * per-level counters.
     */
    void setHeadingNumber(headingNumbers, firstLevel)
    {
        m_headingNumberText = getHeadingNumberText(headingNumbers, firstLevel);
        m_anchorId = ANCHOR_PREFIX + m_headingNumberText;
    }

    /**
     * Rewrite the first text node of the heading so that it starts with the
     * new heading number (replacing any number already there), then capture
     * the text for the table of contents entry. Call setHeadingNumber()
     * first, otherwise the number prefix is empty.
     *
     * Throws RuntimeException if the heading contains no text node at all.
     */
    void setHeadingText()
    {
        textNode = getFirstTextNode(m_elem);
        if (textNode == null)
        {
            throw new RuntimeException("Missing text for heading: " + m_elem);
        }

        // Strip any existing leading number, then prepend the new one.
        text = textNode.getTextTrim();
        HEADING_MATCHER.reset(text);
        text = HEADING_MATCHER.replaceFirst("");
        text = m_headingNumberText + " " + text;
        textNode.setText(text);

        // Get the text for the table of contents for this heading.
        // If we just use m_elem.getTextTrim() this will only give us the
        // direct text.
        //
        // For example, if we have a heading like this:
        //
        //   <h1>This is an <i>important</i> chapter</h1>
        //
        // this would give us the contents entry: "This is an chapter",
        // because the italicized text is in a child element.
        // So we need to extract the text recursively from all child
        // elements and use that for the contents entry. In the
        // above example this would give the contents entry:
        // "This is an important chapter".
        m_contentText = getAllText(m_elem);
    }

    return this;
}

// ========== Utility methods

/**
 * Create an XPath for the given expression with the "x" prefix bound to
 * the XHTML namespace.
 */
XPath getXPath(path)
{
    xp = XPath.newInstance(path);
    xp.addNamespace(NS);
    return xp;
}

/**
 * Return true if the node is an Element with name h1...h6,
 * otherwise return false
 */
boolean isHeading(node)
{
    if (!(node instanceof Element))
    {
        return false;
    }
    name = node.getName();
    if (name != null
        && name.length() == 2
        && name.startsWith("h")
        && name.charAt(1) >= '1'
        && name.charAt(1) <= '6')
    {
        return true;
    }
    return false;
}

/**
 * return 1, 2, ... for elements "h1", "h2", ...
 */
int getHeadingLevel(elemName)
{
    return elemName.charAt(1) - '0';
}

/**
 * Return the heading number as a dotted string, e.g. "2.1.4"
 *
 * Counters are read from index firstLevel upwards; the first counter that
 * is zero ends the number.
 */
String getHeadingNumberText(headingNumbers, firstLevel)
{
    StringBuffer buf = new StringBuffer();
    for (i = firstLevel; i < headingNumbers.length; i++)
    {
        if (headingNumbers[i] == 0)
        {
            break;
        }
        if (i > firstLevel)
        {
            buf.append(".");
        }
        buf.append(headingNumbers[i]);
    }
    return buf.toString();
}

/**
 * Return the first child node of type Text, searching recursively
 * from the given node. Search is depth-first. Returns null if there is
 * no Text node at or below the given node.
 */
Text getFirstTextNode(node)
{
    if (node instanceof Text)
    {
        return node;
    }

    // node instanceof Element
    content = node.getContent(TEXT_AND_ELEMENT_FILTER);
    for (contentNode: content)
    {
        result = getFirstTextNode(contentNode);
        if (result != null)
        {
            return result;
        }
    }
    return null;
}

/**
 * Return the text of all nodes at and below the given node, searching
 * recursively. Search is depth-first. Each piece of text is trimmed and
 * followed by a single space in the result.
 */
String getAllText(node)
{
    textList = new ArrayList();
    getAllText(node, textList);

    buf = new StringBuffer();
    for (text: textList)
    {
        buf.append(text);
        buf.append(" ");
    }
    return buf.toString();
}

/**
 * Find the text of all nodes at and below the given node, searching
 * recursively. Search is depth-first. Append the text found to
 * the specified textList.
 */
void getAllText(node, textList)
{
    if (node instanceof Text)
    {
        textList.add(node.getTextTrim());
    }
    else // node instanceof Element
    {
        content = node.getContent(TEXT_AND_ELEMENT_FILTER);
        for (contentNode: content)
        {
            getAllText(contentNode, textList);
        }
    }
}

// ========== Begin main body

// Process command line
if (bsh.args.length != 3)
{
    for (usageLine: usageLines)
    {
        print(usageLine);
    }
    exit();
}

inFilename = bsh.args[0];
inFile = new File(inFilename);
if (!inFile.exists())
{
    print("Input file does not exist: " + inFilename);
    exit();
}

outFilename = bsh.args[1];
outFile = new File(outFilename);

stylesheetFilename = bsh.args[2];
stylesheetFile = new File(stylesheetFilename);
if (!stylesheetFile.exists())
{
    print("Stylesheet file does not exist: " + stylesheetFilename);
    exit();
}

// Load the input file
saxBuilder = new SAXBuilder();
jdomDoc = saxBuilder.build(inFile);
rootElem = jdomDoc.getRootElement();

// Find where we will insert the toc
tocElem = getXPath("//x:div[@id='toc']").selectSingleNode(rootElem);
if (tocElem == null)
{
    print("Missing toc placeholder. Expected: <div id=\"toc\"></div>");
    exit();
}

// Get all of the headings
bodyElem = getXPath("x:body").selectSingleNode(rootElem);
allNodes = getXPath("//x:*").selectNodes(bodyElem);
headings = new ArrayList();
for (node: allNodes)
{
    if (isHeading(node))
    {
        headings.add(Heading(node));
    }
}

if (headings.size() == 0)
{
    print("There are no headings in the source document.");
    exit();
}

if (trace)
{
    for (heading: headings)
    {
        print(heading.m_elem.getName() + ": " + heading.m_elem.getTextTrim());
    }
}

// The headingNumbers array keeps track of the current heading numbers.
// Index [0] is unused, indexes [1] up to and including [6] represent
// the current heading number for h1...h6.
int[] headingNumbers = new int[MAX_HEADING_LEVEL + 1];
headingNumbers[0] = -1;
for (i = 1; i <= MAX_HEADING_LEVEL; i++)
{
    headingNumbers[i] = 0;
}

// Level of the first heading in the document; 0 until that heading is seen.
firstLevel = 0;
for (heading: headings)
{
    if (firstLevel == 0)
    {
        firstLevel = heading.m_level;
    }
    else if (heading.m_level < firstLevel)
    {
        print("Error: the first heading in the document must have the "
            + "lowest hN number, this heading is not valid: "
            + heading.m_elem.getName() + ": " + heading.m_elem.getTextTrim());
        exit();
    }

    headingNumbers[heading.m_level]++;

    // Every level between the first level and this heading's level must
    // already have been numbered, otherwise a level has been skipped.
    for (i = firstLevel; i < heading.m_level; i++)
    {
        if (headingNumbers[i] <= 0)
        {
            print("Error: gap in heading levels. This heading is more than "
                + "one level deeper than any previous heading: "
                + heading.m_elem.getName() + ": " + heading.m_elem.getTextTrim());
            exit();
        }
    }

    // When a more prominent heading (with a lower hN number) is found,
    // reset the numbers for less prominent headings (those with
    // higher hN numbers).
    for (i = heading.m_level + 1; i <= MAX_HEADING_LEVEL; i++)
    {
        headingNumbers[i] = 0;
    }

    heading.setHeadingNumber(headingNumbers, firstLevel);
    heading.setHeadingText();

    if (trace)
    {
        print(heading.m_headingNumberText + ": " + heading.m_elem.getTextTrim());
    }
}

// Create an in-memory XML document, tocDataDoc, to hold the table of
// contents data. For example:
//
//   <toc>
//     <entry refid="toc-1" level="0">1 A Heading</entry>
//     <entry refid="toc-1.1" level="1">1.1 Another Heading</entry>
//   </toc>
//
tocDataRoot = new Element("toc");
tocDataDoc = new Document(tocDataRoot);
for (heading: headings)
{
    // Remove any existing toc-* anchors (this allows the file to be
    // processed multiple times without creating duplicate tags).
    anchors = getXPath("x:a[@id]").selectNodes(heading.m_elem);
    for (anchor: anchors)
    {
        if (anchor.getAttributeValue("id").startsWith(ANCHOR_PREFIX))
        {
            anchor.detach();
        }
    }

    // Add one new anchor tag to each heading.
    anchor = new Element("a", heading.m_elem.getNamespace()).
        setAttribute("id", heading.m_anchorId);
    heading.m_elem.addContent(anchor);

    // Add this heading to the TOC data document
    tocDataElem = new Element("entry").
        setText(heading.m_contentText).
        setAttribute("refid", heading.m_anchorId);
    tocDataElem.setAttribute("level",
        String.valueOf(heading.m_level - firstLevel));
    tocDataRoot.addContent(tocDataElem);
}

// Create TOC display by merging TOC data with stylesheet
transformer = TransformerFactory.newInstance().
    newTransformer(new StreamSource(stylesheetFile));
JDOMResult tocResult = new JDOMResult();
transformer.transform(new JDOMSource(tocDataDoc), tocResult);

// Delete all the current content in the main doc TOC (if any)
tocElem.getContent().clear();

// Merge transformed doc into main doc
for (resultNode: tocResult.getResult())
{
    if (resultNode instanceof Element)
    {
        resultNode.setNamespace(tocElem.getNamespace());
    }
    tocElem.addContent(resultNode);
}

// Output the merged doc
XMLOutputter outputter = new XMLOutputter();
outputter.setEncoding("UTF-8");
outputter.setTextTrim(true);
outputter.setIndent(" ");
outputter.setNewlines(true);

outStream = new FileOutputStream(outFile);
try
{
    outputter.output(jdomDoc, outStream);
}
finally
{
    // Close the file even if writing the document fails.
    outStream.close();
}