主题:[活动]各位大侠,帮忙分析这个程序的设计思想,画出流程图。小弟不胜感激~~~
package hubinjushi;
import java.io.*;
import java.util.Vector;
import org.htmlparser.*;
import org.htmlparser.util.*;
import org.htmlparser.filters.*;
import org.htmlparser.tags.*;
import org.htmlparser.visitors.*;
import org.htmlparser.nodes.*;
/**
 * Extracts the main text of an HTML page with the HTMLParser library.
 *
 * <p>{@link #parse()} parses the HTML source, removes link/form/script/select/style nodes from
 * the body, removes table cells whose whitespace-stripped text is shorter than
 * {@link #MIN_TEXT_LENGTH} characters, and finally remembers the table cell with the longest
 * text. That text is available through {@link #getContent()}.
 */
public class WebPage {
    /** Table cells whose whitespace-stripped text is shorter than this are treated as noise. */
    private static final int MIN_TEXT_LENGTH = 10;

    private String htmlSource;
    private String parsedSource;
    private String encoding;
    private String title;
    private String url;
    private NodeList keepedNodeList;
    /** Table cell with the longest text found so far. */
    private TableColumn keyTd;
    /** Whitespace-stripped text of {@link #keyTd}. */
    private String content;

    /**
     * @param htmlSource the raw HTML text to analyse; may be {@code null}, in which case
     *                   {@link #parse()} does nothing
     */
    public WebPage(String htmlSource) {
        this.htmlSource = htmlSource;
    }

    /** @return the text selected by the last {@link #parse()}, or whatever was set explicitly */
    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    /** @return the plain text of the first {@code <title>} tag found by {@link #parse()} */
    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the encoding reported by the parser during {@link #parse()} */
    public String getEncoding() {
        return encoding;
    }

    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    public String getHtmlSource() {
        return htmlSource;
    }

    public void setHtmlSource(String htmlSource) {
        this.htmlSource = htmlSource;
    }

    public String getParsedSource() {
        return parsedSource;
    }

    public void setParsedSource(String parsedSource) {
        this.parsedSource = parsedSource;
    }

    public NodeList getKeepedNodeList() {
        return keepedNodeList;
    }

    public void setKeepedNodeList(NodeList keepedNodeList) {
        this.keepedNodeList = keepedNodeList;
    }

    /**
     * Creates a parser whose node factory has extra tag prototypes registered, so that these
     * tags are produced as their own node classes.
     */
    private Parser getParser() {
        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
        factory.registerTag(new StrongTag()); // <STRONG> tag
        factory.registerTag(new BoldTag());   // <B> tag
        factory.registerTag(new BigTag());    // <BIG> tag
        factory.registerTag(new H1Tag());     // <H1> tag
        factory.registerTag(new H2Tag());     // <H2> tag
        factory.registerTag(new H3Tag());     // <H3> tag
        factory.registerTag(new FontTag());   // <FONT> tag

        Parser parser = new Parser();
        parser.setNodeFactory(factory);
        return parser;
    }

    /** Returns true for div, paragraph, title, big, bold, h1-h3 and strong tags. */
    private boolean isKeyTag(Tag tag) {
        // instanceof is false for null, so no separate null check is needed.
        return tag instanceof Div
                || tag instanceof ParagraphTag
                || tag instanceof TitleTag
                || tag instanceof BigTag
                || tag instanceof BoldTag
                || tag instanceof H1Tag
                || tag instanceof H2Tag
                || tag instanceof H3Tag
                || tag instanceof StrongTag;
    }

    /** Removes all whitespace runs from {@code s}; {@code s} must not be null. */
    private static String stripWhitespace(String s) {
        String stripped = s.replaceAll("[\\s]+", "");
        // NOTE(review): if this literal is a plain space the call is a no-op after the line
        // above; it may originally have been "&nbsp;" before the code was pasted - confirm.
        return stripped.replaceAll(" ", "");
    }

    /** Debug helper: prints the stripped text, preceded by its length when it is not empty. */
    private void print(String s) {
        if (s == null) {
            return;
        }
        s = stripWhitespace(s);
        if (s.length() > 0) {
            System.out.print("*********长度:" + s.length() + "*************");
        }
        System.out.println(s);
    }

    /** Returns true for link, form, script, select and style tags, which are pruned. */
    private boolean isNotToProcessNode(Node node) {
        return node instanceof LinkTag
                || node instanceof FormTag
                || node instanceof ScriptTag
                || node instanceof SelectTag
                || node instanceof StyleTag;
    }

    /** Detaches {@code node} from its parent when the parent is a composite tag holding it. */
    private void removeFromParent(Node node) {
        Node parent = node.getParent();
        if (parent instanceof CompositeTag) {
            CompositeTag parentTag = (CompositeTag) parent;
            int index = parentTag.findPositionOf(node);
            if (index >= 0) {
                parentTag.removeChild(index);
            }
        }
    }

    /**
     * Recursively removes every node accepted by {@link #isNotToProcessNode(Node)} from the
     * subtree rooted at {@code node} (including {@code node} itself).
     */
    private void processKeyTag(Node node) throws ParserException {
        if (node == null || node instanceof TextNode) {
            return;
        }
        if (isNotToProcessNode(node)) {
            removeFromParent(node);
            return;
        }
        NodeList childList = node.getChildren();
        if (childList == null) {
            return;
        }
        // Walk backwards: a child may remove itself from this list, and removing at index i
        // only shifts elements that have already been visited.
        for (int i = childList.size() - 1; i >= 0; i--) {
            processKeyTag(childList.elementAt(i));
        }
    }

    /** Returns true when the stripped text is null or shorter than {@link #MIN_TEXT_LENGTH}. */
    private boolean isNoise(String s) {
        if (s == null) {
            return true;
        }
        return stripWhitespace(s).length() < MIN_TEXT_LENGTH;
    }

    /** Returns true when a table cell is found among the ancestors below the body tag. */
    private boolean nestedByTdTag(Node node) throws ParserException {
        if (node == null) {
            return false;
        }
        Node parent = node.getParent();
        // Stop at the body tag, or at the root when the node is not inside a body at all.
        while (parent != null && !(parent instanceof BodyTag)) {
            if (parent instanceof TableColumn) {
                return true;
            }
            parent = parent.getParent();
        }
        return false;
    }

    /** Returns true when any descendant of {@code node} is a table cell. */
    private boolean hasTdTag(Node node) throws ParserException {
        if (node == null) {
            return false;
        }
        NodeList childList = node.getChildren();
        if (childList == null) {
            return false;
        }
        for (int i = 0; i < childList.size(); i++) {
            Node child = childList.elementAt(i);
            if (child instanceof TableColumn || hasTdTag(child)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Records {@code td} as the key cell when its stripped text is longer than the text
     * recorded so far (or when nothing has been recorded yet).
     */
    private void addTd(TableColumn td, String s) throws ParserException {
        if (s == null) {
            return;
        }
        String stripped = stripWhitespace(s);
        // content can be null here if a caller used setContent(null) after a cell was chosen.
        if (this.keyTd == null || this.content == null
                || this.content.length() < stripped.length()) {
            this.keyTd = td;
            this.content = stripped;
        }
    }

    /**
     * Visits every table cell below {@code node} (children before parents) and offers its text
     * to {@link #addTd}. For each cell only direct children that do not themselves contain a
     * table cell contribute text, so an outer cell does not absorb the text of nested tables.
     *
     * @param node root of the subtree to scan; {@code null} is ignored
     */
    public void printTdTagString(Node node) throws ParserException {
        if (node == null) {
            return;
        }
        NodeList childList = node.getChildren();
        if (childList == null) {
            return;
        }
        for (int i = 0; i < childList.size(); i++) {
            Node child = childList.elementAt(i);
            printTdTagString(child);
            if (!(child instanceof TableColumn)) {
                continue;
            }
            TableColumn td = (TableColumn) child;
            StringBuffer sb = new StringBuffer();
            for (int j = 0; j < td.getChildCount(); j++) {
                Node tdChild = td.getChild(j);
                if (!hasTdTag(tdChild)) {
                    String s = getStringOfNode(tdChild);
                    if (s != null) {
                        sb.append(s);
                    }
                }
            }
            addTd(td, sb.toString());
        }
    }

    /**
     * Concatenates the text of all text nodes in the subtree rooted at {@code node}.
     *
     * @return the concatenated text, or {@code null} when {@code node} is null or is a
     *         non-text node without a child list
     */
    public String getStringOfNode(Node node) throws ParserException {
        if (node == null) {
            return null;
        }
        if (node instanceof TextNode) {
            return node.getText();
        }
        NodeList childList = node.getChildren();
        if (childList == null) {
            return null;
        }
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < childList.size(); i++) {
            String s = getStringOfNode(childList.elementAt(i));
            if (s != null) {
                sb.append(s);
            }
        }
        return sb.toString();
    }

    /**
     * Removes all noise table cells (see {@link #isNoise(String)}) below the body tag.
     *
     * @param bodyTag the body to clean; {@code null} is ignored
     */
    public void removeNoise(BodyTag bodyTag) throws ParserException {
        if (bodyTag == null) {
            return;
        }
        removeNoiseOfTdNode(bodyTag);
    }

    /** Recursively removes noise table cells from the subtree rooted at {@code node}. */
    private void removeNoiseOfTdNode(Node node) throws ParserException {
        if (node == null) {
            return;
        }
        NodeList childList = node.getChildren();
        if (childList == null) {
            return;
        }
        // Walk backwards so that removing the child at index i does not make the loop skip
        // the sibling that would otherwise slide into position i.
        for (int i = childList.size() - 1; i >= 0; i--) {
            Node child = childList.elementAt(i);
            removeNoiseOfTdNode(child);
            if (child instanceof TableColumn
                    && node instanceof CompositeTag
                    && isNoise(getStringOfNode(child))) {
                ((CompositeTag) node).removeChild(i);
            }
        }
    }

    /**
     * Prunes the body: direct children that are link/form/script/select/style tags are removed,
     * text nodes are left alone, and every other child is cleaned recursively.
     *
     * @param bodyTag the body to prune; {@code null} is ignored
     */
    public void creatParseTree(BodyTag bodyTag) throws ParserException {
        if (bodyTag == null) {
            return;
        }
        NodeList list = bodyTag.getChildren();
        if (list == null) {
            return; // empty body
        }
        // Backwards iteration keeps indices valid while children are being removed.
        for (int i = list.size() - 1; i >= 0; i--) {
            Node node = list.elementAt(i);
            if (node instanceof TextNode) {
                continue;
            }
            if (isNotToProcessNode(node)) {
                bodyTag.removeChild(i);
                continue;
            }
            processKeyTag(node);
        }
    }

    /**
     * Parses the HTML source and fills in encoding, title and content.
     *
     * @return always {@code null}; read the result with {@link #getContent()}
     * @throws ParserException if the underlying parser fails
     */
    public String parse() throws ParserException {
        if (this.htmlSource == null) {
            return null;
        }
        Parser parser = getParser();
        parser.setInputHTML(this.htmlSource);
        NodeList pageList = parser.parse(null);
        this.encoding = parser.getEncoding();

        NodeList titleList = pageList.extractAllNodesThatMatch(new TagNameFilter("title"), true);
        if (titleList.size() > 0) {
            this.title = titleList.elementAt(0).toPlainTextString();
        }

        NodeList bodyList = pageList.extractAllNodesThatMatch(new TagNameFilter("body"), true);
        if (bodyList.size() == 0) {
            return null; // no <body>: nothing to extract
        }
        Node bodyNode = bodyList.elementAt(0);
        if (!(bodyNode instanceof BodyTag)) {
            return null;
        }
        BodyTag bodyTag = (BodyTag) bodyNode;
        creatParseTree(bodyTag);
        removeNoise(bodyTag);
        printTdTagString(bodyTag);
        return null;
    }

    /** Demo entry point: reads {@code filename.htm}, parses it and prints the content. */
    public static void main(String[] args) {
        System.out.println("test is start!");
        try {
            BufferedReader br = new BufferedReader(new FileReader("filename.htm"));
            StringBuffer sb = new StringBuffer();
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    // Keep the line break so markup split across lines is not glued together.
                    sb.append(line).append('\n');
                }
            } finally {
                br.close(); // also closes the wrapped FileReader
            }
            WebPage page = new WebPage(sb.toString());
            page.parse();
            System.out.println(page.getContent());
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println("test is over!");
    }
}
import java.io.*;
import java.util.Vector;
import org.htmlparser.*;
import org.htmlparser.util.*;
import org.htmlparser.filters.*;
import org.htmlparser.tags.*;
import org.htmlparser.visitors.*;
import org.htmlparser.nodes.*;
/**
 * Extracts the main text of an HTML page with the HTMLParser library.
 *
 * <p>{@link #parse()} parses the HTML source, removes link/form/script/select/style nodes from
 * the body, removes table cells whose whitespace-stripped text is shorter than
 * {@link #MIN_TEXT_LENGTH} characters, and finally remembers the table cell with the longest
 * text. That text is available through {@link #getContent()}.
 */
public class WebPage {
    /** Table cells whose whitespace-stripped text is shorter than this are treated as noise. */
    private static final int MIN_TEXT_LENGTH = 10;

    private String htmlSource;
    private String parsedSource;
    private String encoding;
    private String title;
    private String url;
    private NodeList keepedNodeList;
    /** Table cell with the longest text found so far. */
    private TableColumn keyTd;
    /** Whitespace-stripped text of {@link #keyTd}. */
    private String content;

    /**
     * @param htmlSource the raw HTML text to analyse; may be {@code null}, in which case
     *                   {@link #parse()} does nothing
     */
    public WebPage(String htmlSource) {
        this.htmlSource = htmlSource;
    }

    /** @return the text selected by the last {@link #parse()}, or whatever was set explicitly */
    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    /** @return the plain text of the first {@code <title>} tag found by {@link #parse()} */
    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the encoding reported by the parser during {@link #parse()} */
    public String getEncoding() {
        return encoding;
    }

    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    public String getHtmlSource() {
        return htmlSource;
    }

    public void setHtmlSource(String htmlSource) {
        this.htmlSource = htmlSource;
    }

    public String getParsedSource() {
        return parsedSource;
    }

    public void setParsedSource(String parsedSource) {
        this.parsedSource = parsedSource;
    }

    public NodeList getKeepedNodeList() {
        return keepedNodeList;
    }

    public void setKeepedNodeList(NodeList keepedNodeList) {
        this.keepedNodeList = keepedNodeList;
    }

    /**
     * Creates a parser whose node factory has extra tag prototypes registered, so that these
     * tags are produced as their own node classes.
     */
    private Parser getParser() {
        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
        factory.registerTag(new StrongTag()); // <STRONG> tag
        factory.registerTag(new BoldTag());   // <B> tag
        factory.registerTag(new BigTag());    // <BIG> tag
        factory.registerTag(new H1Tag());     // <H1> tag
        factory.registerTag(new H2Tag());     // <H2> tag
        factory.registerTag(new H3Tag());     // <H3> tag
        factory.registerTag(new FontTag());   // <FONT> tag

        Parser parser = new Parser();
        parser.setNodeFactory(factory);
        return parser;
    }

    /** Returns true for div, paragraph, title, big, bold, h1-h3 and strong tags. */
    private boolean isKeyTag(Tag tag) {
        // instanceof is false for null, so no separate null check is needed.
        return tag instanceof Div
                || tag instanceof ParagraphTag
                || tag instanceof TitleTag
                || tag instanceof BigTag
                || tag instanceof BoldTag
                || tag instanceof H1Tag
                || tag instanceof H2Tag
                || tag instanceof H3Tag
                || tag instanceof StrongTag;
    }

    /** Removes all whitespace runs from {@code s}; {@code s} must not be null. */
    private static String stripWhitespace(String s) {
        String stripped = s.replaceAll("[\\s]+", "");
        // NOTE(review): if this literal is a plain space the call is a no-op after the line
        // above; it may originally have been "&nbsp;" before the code was pasted - confirm.
        return stripped.replaceAll(" ", "");
    }

    /** Debug helper: prints the stripped text, preceded by its length when it is not empty. */
    private void print(String s) {
        if (s == null) {
            return;
        }
        s = stripWhitespace(s);
        if (s.length() > 0) {
            System.out.print("*********长度:" + s.length() + "*************");
        }
        System.out.println(s);
    }

    /** Returns true for link, form, script, select and style tags, which are pruned. */
    private boolean isNotToProcessNode(Node node) {
        return node instanceof LinkTag
                || node instanceof FormTag
                || node instanceof ScriptTag
                || node instanceof SelectTag
                || node instanceof StyleTag;
    }

    /** Detaches {@code node} from its parent when the parent is a composite tag holding it. */
    private void removeFromParent(Node node) {
        Node parent = node.getParent();
        if (parent instanceof CompositeTag) {
            CompositeTag parentTag = (CompositeTag) parent;
            int index = parentTag.findPositionOf(node);
            if (index >= 0) {
                parentTag.removeChild(index);
            }
        }
    }

    /**
     * Recursively removes every node accepted by {@link #isNotToProcessNode(Node)} from the
     * subtree rooted at {@code node} (including {@code node} itself).
     */
    private void processKeyTag(Node node) throws ParserException {
        if (node == null || node instanceof TextNode) {
            return;
        }
        if (isNotToProcessNode(node)) {
            removeFromParent(node);
            return;
        }
        NodeList childList = node.getChildren();
        if (childList == null) {
            return;
        }
        // Walk backwards: a child may remove itself from this list, and removing at index i
        // only shifts elements that have already been visited.
        for (int i = childList.size() - 1; i >= 0; i--) {
            processKeyTag(childList.elementAt(i));
        }
    }

    /** Returns true when the stripped text is null or shorter than {@link #MIN_TEXT_LENGTH}. */
    private boolean isNoise(String s) {
        if (s == null) {
            return true;
        }
        return stripWhitespace(s).length() < MIN_TEXT_LENGTH;
    }

    /** Returns true when a table cell is found among the ancestors below the body tag. */
    private boolean nestedByTdTag(Node node) throws ParserException {
        if (node == null) {
            return false;
        }
        Node parent = node.getParent();
        // Stop at the body tag, or at the root when the node is not inside a body at all.
        while (parent != null && !(parent instanceof BodyTag)) {
            if (parent instanceof TableColumn) {
                return true;
            }
            parent = parent.getParent();
        }
        return false;
    }

    /** Returns true when any descendant of {@code node} is a table cell. */
    private boolean hasTdTag(Node node) throws ParserException {
        if (node == null) {
            return false;
        }
        NodeList childList = node.getChildren();
        if (childList == null) {
            return false;
        }
        for (int i = 0; i < childList.size(); i++) {
            Node child = childList.elementAt(i);
            if (child instanceof TableColumn || hasTdTag(child)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Records {@code td} as the key cell when its stripped text is longer than the text
     * recorded so far (or when nothing has been recorded yet).
     */
    private void addTd(TableColumn td, String s) throws ParserException {
        if (s == null) {
            return;
        }
        String stripped = stripWhitespace(s);
        // content can be null here if a caller used setContent(null) after a cell was chosen.
        if (this.keyTd == null || this.content == null
                || this.content.length() < stripped.length()) {
            this.keyTd = td;
            this.content = stripped;
        }
    }

    /**
     * Visits every table cell below {@code node} (children before parents) and offers its text
     * to {@link #addTd}. For each cell only direct children that do not themselves contain a
     * table cell contribute text, so an outer cell does not absorb the text of nested tables.
     *
     * @param node root of the subtree to scan; {@code null} is ignored
     */
    public void printTdTagString(Node node) throws ParserException {
        if (node == null) {
            return;
        }
        NodeList childList = node.getChildren();
        if (childList == null) {
            return;
        }
        for (int i = 0; i < childList.size(); i++) {
            Node child = childList.elementAt(i);
            printTdTagString(child);
            if (!(child instanceof TableColumn)) {
                continue;
            }
            TableColumn td = (TableColumn) child;
            StringBuffer sb = new StringBuffer();
            for (int j = 0; j < td.getChildCount(); j++) {
                Node tdChild = td.getChild(j);
                if (!hasTdTag(tdChild)) {
                    String s = getStringOfNode(tdChild);
                    if (s != null) {
                        sb.append(s);
                    }
                }
            }
            addTd(td, sb.toString());
        }
    }

    /**
     * Concatenates the text of all text nodes in the subtree rooted at {@code node}.
     *
     * @return the concatenated text, or {@code null} when {@code node} is null or is a
     *         non-text node without a child list
     */
    public String getStringOfNode(Node node) throws ParserException {
        if (node == null) {
            return null;
        }
        if (node instanceof TextNode) {
            return node.getText();
        }
        NodeList childList = node.getChildren();
        if (childList == null) {
            return null;
        }
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < childList.size(); i++) {
            String s = getStringOfNode(childList.elementAt(i));
            if (s != null) {
                sb.append(s);
            }
        }
        return sb.toString();
    }

    /**
     * Removes all noise table cells (see {@link #isNoise(String)}) below the body tag.
     *
     * @param bodyTag the body to clean; {@code null} is ignored
     */
    public void removeNoise(BodyTag bodyTag) throws ParserException {
        if (bodyTag == null) {
            return;
        }
        removeNoiseOfTdNode(bodyTag);
    }

    /** Recursively removes noise table cells from the subtree rooted at {@code node}. */
    private void removeNoiseOfTdNode(Node node) throws ParserException {
        if (node == null) {
            return;
        }
        NodeList childList = node.getChildren();
        if (childList == null) {
            return;
        }
        // Walk backwards so that removing the child at index i does not make the loop skip
        // the sibling that would otherwise slide into position i.
        for (int i = childList.size() - 1; i >= 0; i--) {
            Node child = childList.elementAt(i);
            removeNoiseOfTdNode(child);
            if (child instanceof TableColumn
                    && node instanceof CompositeTag
                    && isNoise(getStringOfNode(child))) {
                ((CompositeTag) node).removeChild(i);
            }
        }
    }

    /**
     * Prunes the body: direct children that are link/form/script/select/style tags are removed,
     * text nodes are left alone, and every other child is cleaned recursively.
     *
     * @param bodyTag the body to prune; {@code null} is ignored
     */
    public void creatParseTree(BodyTag bodyTag) throws ParserException {
        if (bodyTag == null) {
            return;
        }
        NodeList list = bodyTag.getChildren();
        if (list == null) {
            return; // empty body
        }
        // Backwards iteration keeps indices valid while children are being removed.
        for (int i = list.size() - 1; i >= 0; i--) {
            Node node = list.elementAt(i);
            if (node instanceof TextNode) {
                continue;
            }
            if (isNotToProcessNode(node)) {
                bodyTag.removeChild(i);
                continue;
            }
            processKeyTag(node);
        }
    }

    /**
     * Parses the HTML source and fills in encoding, title and content.
     *
     * @return always {@code null}; read the result with {@link #getContent()}
     * @throws ParserException if the underlying parser fails
     */
    public String parse() throws ParserException {
        if (this.htmlSource == null) {
            return null;
        }
        Parser parser = getParser();
        parser.setInputHTML(this.htmlSource);
        NodeList pageList = parser.parse(null);
        this.encoding = parser.getEncoding();

        NodeList titleList = pageList.extractAllNodesThatMatch(new TagNameFilter("title"), true);
        if (titleList.size() > 0) {
            this.title = titleList.elementAt(0).toPlainTextString();
        }

        NodeList bodyList = pageList.extractAllNodesThatMatch(new TagNameFilter("body"), true);
        if (bodyList.size() == 0) {
            return null; // no <body>: nothing to extract
        }
        Node bodyNode = bodyList.elementAt(0);
        if (!(bodyNode instanceof BodyTag)) {
            return null;
        }
        BodyTag bodyTag = (BodyTag) bodyNode;
        creatParseTree(bodyTag);
        removeNoise(bodyTag);
        printTdTagString(bodyTag);
        return null;
    }

    /** Demo entry point: reads {@code filename.htm}, parses it and prints the content. */
    public static void main(String[] args) {
        System.out.println("test is start!");
        try {
            BufferedReader br = new BufferedReader(new FileReader("filename.htm"));
            StringBuffer sb = new StringBuffer();
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    // Keep the line break so markup split across lines is not glued together.
                    sb.append(line).append('\n');
                }
            } finally {
                br.close(); // also closes the wrapped FileReader
            }
            WebPage page = new WebPage(sb.toString());
            page.parse();
            System.out.println(page.getContent());
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println("test is over!");
    }
}