Consider:
/**
* Returns the style for a node.
*
* @param n The node to check.
* @param p The property to retrieve (usually 'display').
* @link http://www.quirksmode.org/dom/getstyles.html
*/
this.getStyle = function( n, p ) {
return n.currentStyle ?
n.currentStyle[p] :
document.defaultView.getComputedStyle(n, null).getPropertyValue(p);
}
/**
* Converts HTML to text, preserving semantic newlines for block-level
* elements.
*
* @param node - The HTML node to perform text extraction.
*/
this.toText = function( node ) {
var result="";
if( node.nodeType == document.TEXT_NODE ) {
// Replace repeated spaces, newlines, and tabs with a single space.
result = node.nodeValue.replace( /\s+/g, ' ' );
}
else {
for( var i = 0, j = node.childNodes.length; i < j; i++ ) {
result += _this.toText( node.childNodes[i] );
}
var d = _this.getStyle( node, 'display' );
if( d.match( /^block/ ) || d.match( /list/ ) || d.match( /row/ ) ||
node.tagName == 'BR' || node.tagName == 'HR' ) {
result += '\n';
}
}
return result;
}
http://jsfiddle.net/3mzrV/2/
That is to say, with an exception or two, iterate through each node and print its contents, letting the browser’s computed style tell you when to insert newlines.