With any given HTML, I need to:
• Remove all tables and their contents
• Remove everything after the first h1
  tag 
• Keep only paragraphs (INCLUDING
  their inner HTML (links, lists, etc))
This can be done very easily with XSLT:
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:h="http://www.w3.org/1999/xhtml">

  <!--
    Purpose: reduce an XHTML document to its paragraphs.
      * every table is dropped together with all of its contents;
      * every node that has an h1 before it in document order is dropped;
      * of what remains, only p elements are emitted, copied with their
        attributes and their complete inner markup (links, lists, ...).
    The result is a sequence of p elements with no wrapping root element.

    The override templates carry explicit, distinct priorities. With the
    default priorities the "unwrap" rule (0.5) would outrank the table
    rule (0), so paragraphs nested in table cells would be emitted, and it
    would tie with the "after h1" rule (0.5), which XSLT 1.0 treats as a
    recoverable error.
  -->

  <!--
    indent="no": kept paragraphs are mixed content, and letting the
    serializer add indentation could insert whitespace into their text.
    No xsl:strip-space is declared for the same reason: a whitespace-only
    text node between two inline elements inside a paragraph is
    significant. Whitespace outside paragraphs is never emitted anyway,
    because the unwrap rule below outputs nothing for text nodes.
  -->
  <xsl:output omit-xml-declaration="yes" indent="no"/>

  <!-- Identity rule: copy every node and attribute unless a more
       specific template below overrides it (default priority -0.5). -->
  <xsl:template match="node()|@*">
    <xsl:copy>
      <xsl:apply-templates select="node()|@*"/>
    </xsl:copy>
  </xsl:template>

  <!-- Remove all tables and their contents. Highest priority, so a table
       is dropped wherever it occurs and its descendants are never
       visited. -->
  <xsl:template match="h:table" priority="3"/>

  <!-- Remove everything after the first h1: any node with an h1 on its
       preceding axis produces no output. -->
  <xsl:template match="node()[preceding::h:h1]" priority="2"/>

  <!-- Keep only paragraphs: a node that is neither a p nor inside a p is
       not copied itself; only its children are processed, so paragraphs
       nested deeper are still found. Text, comments and processing
       instructions outside a p have no children and so yield nothing. -->
  <xsl:template match="node()[not(self::h:p) and not(ancestor::h:p)]"
                priority="1">
    <xsl:apply-templates/>
  </xsl:template>

</xsl:stylesheet>
If your element names are not in the XHTML namespace, simply delete every occurrence of h: in the above code.