为了支持全文检索,有必要将HTML格式的文章转化为纯文本格式,因此我设计了一个基本的WebFormatter类,提供一个简单的public static String html2text(String html),将HTML格式转化为Text:

/*

* File: WebFormatter.java

* Created on 2005-6-24

* Author: Liao Xuefeng, asklxf@163.com

* Copyright (C) 2005, Liao Xuefeng.

*/

package com.mboker.blog.web.util;

import java.util.*;

import java.text.SimpleDateFormat;

/**

* Do some format on web display.

*

* @author Xuefeng

*/

public class WebFormatter {

public static String html2text(String html) {

StringBuffer sb = new StringBuffer(html.length());

char[] data = html.toCharArray();

int start = 0;

boolean previousIsPre = false;

Token token = null;

for(;;) {

token = parse(data, start, previousIsPre);

if(token==null)

break;

previousIsPre = token.isPreTag();

sb = sb.append(token.getText());

start += token.getLength();

}

return sb.toString();

}

private static Token parse(char[] data, int start, boolean previousIsPre) {

if(start>=data.length)

return null;

// try to read next char:

char c = data[start];

if(c=='

// this is a tag or comment or script:

int end_index = indexOf(data, start+1, '>');

if(end_index==(-1)) {

// the left is all text!

return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);

}

String s = new String(data, start, end_index-start+1);

// now we got s="<...>":

if(s.startsWith("");

if(end_comment_index==(-1)) {

// illegal end, but treat as comment:

return new Token(Token.TOKEN_COMMENT, data, start, data.length, previousIsPre);

}

else

return new Token(Token.TOKEN_COMMENT, data, start, end_comment_index+3, previousIsPre);

}

String s_lowerCase = s.toLowerCase();

if(s_lowerCase.startsWith("");

if(end_script_index==(-1))

// illegal end, but treat as script:

return new Token(Token.TOKEN_SCRIPT, data, start, data.length, previousIsPre);

else

return new Token(Token.TOKEN_SCRIPT, data, start, end_script_index+9, previousIsPre);

}

else { // this is a tag:

return new Token(Token.TOKEN_TAG, data, start, start+s.length(), previousIsPre);

}

}

// this is a text:

int next_tag_index = indexOf(data, start+1, '

if(next_tag_index==(-1))

return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);

return new Token(Token.TOKEN_TEXT, data, start, next_tag_index, previousIsPre);

}

private static int indexOf(char[] data, int start, String s) {

char[] ss = s.toCharArray();

// TODO: performance can improve!

for(int i=start; i

// compare from data[i] with ss[0]:

boolean match = true;

for(int j=0; j

if(data[i+j]!=ss[j]) {

match = false;

break;

}

}

if(match)

return i;

}

return (-1);

}

private static int indexOf(char[] data, int start, char c) {

for(int i=start; i

if(data[i]==c)

return i;

}

return (-1);

}

}

/**
 * One lexical piece of an HTML document (a text run, a tag, a comment or a
 * script block) together with the plain text it contributes.
 *
 * Tags contribute at most a separator (line break, bullet, rule, tab);
 * comments and scripts contribute nothing; text runs are whitespace-normalized
 * and have their character references decoded.
 */
class Token {

    public static final int TOKEN_TEXT    = 0; // html text.
    public static final int TOKEN_COMMENT = 1; // comment like <!-- comments... -->
    public static final int TOKEN_TAG     = 2; // tag like <pre>, <font>, etc.
    public static final int TOKEN_SCRIPT  = 3; // <script ...> ... </script> block

    // Start tags are matched as a prefix that must not be followed by a letter.
    private static final char[] TAG_BR  = "<br".toCharArray();
    private static final char[] TAG_P   = "<p".toCharArray();
    private static final char[] TAG_LI  = "<li".toCharArray();
    private static final char[] TAG_PRE = "<pre".toCharArray();
    private static final char[] TAG_HR  = "<hr".toCharArray();

    // End tags are matched as a plain case-insensitive prefix.
    private static final char[] END_TAG_TD = "</td>".toCharArray();
    private static final char[] END_TAG_TR = "</tr>".toCharArray();
    private static final char[] END_TAG_LI = "</li>".toCharArray();

    // Longest character reference that is looked for, counting '&' and ';'.
    private static final int MAX_ENTITY_LENGTH = 10;

    // Named character references (including '&' and ';') and their text.
    private static final Map<String, String> SPECIAL_CHARS = new HashMap<String, String>();

    private final int type;
    private final String html;     // original html
    private String text = null;    // text!
    private final int length;      // html length
    private boolean isPre = false; // isPre tag?

    static {
        SPECIAL_CHARS.put("&quot;",  "\"");
        SPECIAL_CHARS.put("&lt;",    "<");
        SPECIAL_CHARS.put("&gt;",    ">");
        SPECIAL_CHARS.put("&amp;",   "&");
        SPECIAL_CHARS.put("&reg;",   "(r)");
        SPECIAL_CHARS.put("&copy;",  "(c)");
        SPECIAL_CHARS.put("&nbsp;",  " ");
        SPECIAL_CHARS.put("&pound;", "\u00a3");
    }

    /**
     * Creates the token covering {@code data[start..end)}.
     *
     * @param type          one of the {@code TOKEN_*} constants
     * @param data          the whole document
     * @param start         index of the first character of the token
     * @param end           index just past the last character of the token
     * @param previousIsPre whether the preceding token was a pre tag; only
     *                      affects how a text token treats whitespace
     */
    public Token(int type, char[] data, int start, int end, boolean previousIsPre) {
        this.type = type;
        this.length = end - start;
        this.html = new String(data, start, length);
        parseText(previousIsPre);
    }

    /** Returns the number of source characters this token covers. */
    public int getLength() {
        return length;
    }

    /** Returns true if this token is a {@code <pre ...>} start tag. */
    public boolean isPreTag() {
        return isPre;
    }

    /**
     * Computes {@link #text} (and {@link #isPre}) from the token type and its
     * source. Comment and script tokens keep a null text.
     */
    private void parseText(boolean previousIsPre) {
        if (type == TOKEN_TAG) {
            char[] cs = html.toCharArray();
            if (compareTag(TAG_BR, cs) || compareTag(TAG_P, cs)) {
                text = "\n";
            } else if (compareTag(TAG_LI, cs)) {
                text = "\n* ";
            } else if (compareTag(TAG_PRE, cs)) {
                isPre = true;
            } else if (compareTag(TAG_HR, cs)) {
                text = "\n--------\n";
            } else if (compareString(END_TAG_TD, cs)) {
                text = "\t";
            } else if (compareString(END_TAG_TR, cs) || compareString(END_TAG_LI, cs)) {
                text = "\n";
            }
        } else if (type == TOKEN_TEXT) {
            text = toText(html, previousIsPre);
        }
    }

    /** Returns the plain text of this token; empty, never null. */
    public String getText() {
        return text == null ? "" : text;
    }

    /**
     * Converts a run of HTML text to plain text.
     *
     * When {@code isPre} is true, spaces and tabs are kept and every line
     * break (CR, LF or CR LF) becomes a single '\n'. Otherwise any run of
     * spaces, tabs and line breaks collapses into one space, so that words
     * separated only by a line break do not get glued together. Recognized
     * character references are decoded; anything else starting with '&amp;' is
     * copied through unchanged.
     */
    private String toText(String html, final boolean isPre) {
        char[] cs = html.toCharArray();
        StringBuilder buffer = new StringBuilder(cs.length);
        boolean previousWasSpace = false;
        int pos = 0;
        while (pos < cs.length) {
            char current = cs[pos];

            if (current == '\r' || current == '\n') {
                // CR LF counts as one line break.
                boolean crlf = current == '\r' && pos + 1 < cs.length && cs[pos + 1] == '\n';
                if (isPre) {
                    buffer.append('\n');
                } else if (!previousWasSpace) {
                    buffer.append(' ');
                }
                previousWasSpace = true;
                pos += crlf ? 2 : 1;
                continue;
            }

            if (current == ' ' || current == '\t') {
                if (isPre) {
                    buffer.append(current);
                } else if (!previousWasSpace) {
                    buffer.append(' ');
                }
                previousWasSpace = true;
                pos++;
                continue;
            }

            previousWasSpace = false;

            if (current == '&') {
                int entityLength = readUtil(cs, pos, ';', MAX_ENTITY_LENGTH);
                if (entityLength != -1) {
                    String decoded = decodeEntity(new String(cs, pos, entityLength));
                    if (decoded != null) {
                        buffer.append(decoded);
                        // Skip the whole reference, not just the '&'.
                        pos += entityLength;
                        continue;
                    }
                }
                // Not a reference we know: fall through and keep the '&'.
            }

            buffer.append(current);
            pos++;
        }
        return buffer.toString();
    }

    /**
     * Decodes one character reference.
     *
     * @param entity the reference including the leading '&amp;' and the
     *               trailing ';', e.g. {@code &amp;}, {@code &#1234;} or
     *               {@code &#x4E2D;}
     * @return the replacement text, or null if the reference is not a known
     *         name and not a valid numeric reference
     */
    private static String decodeEntity(String entity) {
        String named = SPECIAL_CHARS.get(entity);
        if (named != null) {
            return named;
        }
        // Numeric references need at least "&#", one digit and ";".
        if (entity.length() < 4 || entity.charAt(1) != '#') {
            return null;
        }
        String digits = entity.substring(2, entity.length() - 1);
        int radix = 10;
        if (digits.charAt(0) == 'x' || digits.charAt(0) == 'X') {
            radix = 16;
            digits = digits.substring(1);
        }
        // Reject empty input and a leading sign, which parseInt would accept.
        if (digits.length() == 0 || Character.digit(digits.charAt(0), radix) == -1) {
            return null;
        }
        try {
            int code = Integer.parseInt(digits, radix);
            boolean surrogate = code >= 0xD800 && code <= 0xDFFF;
            if (code > 0 && code <= Character.MAX_CODE_POINT && !surrogate) {
                return new String(Character.toChars(code));
            }
        } catch (NumberFormatException ignored) {
            // Non-digit characters or overflow: not a numeric reference.
        }
        return null;
    }

    /**
     * Scans {@code cs} from {@code start} for the character {@code util},
     * looking at no more than {@code maxLength} characters.
     *
     * @return the number of characters from {@code start} up to and including
     *         the match, or -1 if it was not found within the limit
     */
    private int readUtil(final char[] cs, final int start, final char util, final int maxLength) {
        int end = start + maxLength;
        if (end > cs.length) {
            end = cs.length;
        }
        for (int i = start; i < end; i++) {
            if (cs[i] == util) {
                return i - start + 1;
            }
        }
        return -1;
    }

    /**
     * Tells whether {@code tag} is the start tag named by {@code ori_tag}.
     *
     * {@code ori_tag} is a lower-case prefix such as {@code "<p"}. The match
     * is case-insensitive and the character following the prefix must not be a
     * letter a-z, so {@code "<p"} matches {@code <p>} and {@code <P class=x>}
     * but not {@code <pre>}.
     */
    private boolean compareTag(final char[] ori_tag, char[] tag) {
        // The tag must be strictly longer: there has to be a following char.
        if (ori_tag.length >= tag.length) {
            return false;
        }
        for (int i = 0; i < ori_tag.length; i++) {
            if (Character.toLowerCase(tag[i]) != ori_tag[i]) {
                return false;
            }
        }
        char following = Character.toLowerCase(tag[ori_tag.length]);
        return following < 'a' || following > 'z';
    }

    /**
     * Tells whether {@code comp} starts with the lower-case sequence
     * {@code ori}, ignoring the case of {@code comp}.
     */
    private boolean compareString(final char[] ori, char[] comp) {
        if (ori.length > comp.length) {
            return false;
        }
        for (int i = 0; i < ori.length; i++) {
            if (Character.toLowerCase(comp[i]) != ori[i]) {
                return false;
            }
        }
        return true;
    }

    /** Returns the original HTML source of this token. */
    public String toString() {
        return html;
    }

}

注意,请先将html中的<body>...</body>部分提取出来,再交给WebFormatter处理,因为html->text转换实质是删除所有标签(某些标签如<br>被转化为'\n')、Script和注释,对于JavaScript生成的动态内容(例如document.write)无能为力。

posted on 2006-04-07 16:33 SIMONE 阅读(695) 评论(0)  编辑  收藏 所属分类: JAVA

java html2text_将HTML转化为TEXT的Java类相关推荐

  1. for linux pdf转mobi_pdftotext —— Linux/Unix中将PDF文件转化为Text文本格式的利器

    安装 pdftotext 到 RedHat / RHEL / Fedora / CentOS / Ubuntu 在不同的Linux分发版本中使用poppler-utils包安装pdftotext(Ce ...

  2. 解决Java.lang.NoClassDefFoundError:com/lowagie/text/Elemen的问题

    正在写用itext导出word的项目,在pom.xml里写了以下代码下载itext 2-1-7.jar. <dependency><groupId>com.lowagie< ...

  3. Java把一个文件转化为byte字节数组

    Java把一个文件转化为byte字节数组 /*** 把一个文件转化为byte字节数组.** @return*/private byte[] fileConvertToByteArray(File fi ...

  4. java 数组转化为arraylist_在Java中怎样把数组转换为ArrayList?

    本文分析了Stack Overflow上最热门的的一个问题的答案,提问者获得了很多声望点,使得他得到了在Stack Overflow上做很多事情的权限.这跟我没什么关系,我们还是先看看这个问题吧. 这 ...

  5. 面向 Java 开发人员的 Ajax: 构建动态的 Java 应用程序

    面向 Java 开发人员的 Ajax: 构建动态的 Java 应用程序 Ajax 为更好的 Web 应用程序铺平了道路 在 Web 应用程序开发中,页面重载循环是最大的一个使用障碍,对于 Java™ ...

  6. java中商业数据计算时用到的类BigDecimal和DecimalFormat

    1.引言 借用<Effactive Java>这本书中的话,float和double类型的主要设计目标是为了科学计算和工程计算.他们执行二进制浮点运算,这是为了在广域数值范围上提供较为精确 ...

  7. Java核心类库之(常用API、字符串类、集合类、泛型)

    目录 1 常用API 1.1 Math类 1.2 System类 1.3 Object类 1.4 Objects类 1.5 Arrays类 1.6 基本类型包装类 1.6.1 Integer类概述和使 ...

  8. java基础入门课后习题答案_《Java基础入门》课后习题及答案

    <Java基础入门>课后习题及答案Java基础入门,课后习题,答案 博学谷--让IT教学更简单,让IT学习更有效 第6章JavaAPI 一.填空题 1.在Java中定义了两个类来封装对字符 ...

  9. Java改知能机_Java 面试突击之 Java 并发知识基础 进阶考点全解析

    版权说明:本文内容根据 github 开源项目整理所得 项目地址:https://github.com/Snailclimb/JavaGuide​github.com 一.基础 什么是线程和进程? 何 ...

  10. java面试32问_学员分享:JAVA面试32问(11-20)

    第十一,short s1 = 1; s1 = s1 + 1;有什么错? short s1 = 1; s1 += 1;有什么错? short s1 = 1; s1 = s1 + 1;有错,s1是shor ...

最新文章

  1. MySQL like 通配符是_MySql模糊查询like通配符使用详细介绍
  2. 【Ubuntu】ubuntu更新设置
  3. Angular 路由时如何在 Component 之间传递参数
  4. 用最小二乘法拟合任意次函数曲线(C#)
  5. LeetCode 845. 数组中的最长山脉(中心扩展)
  6. POI操作EXCEL2007,报javax.xml.stream.XMLEventFactory.newFactory()错误!
  7. 吴恩达|机器学习作业8.0.异常检测
  8. 小蚂蚁学习数据结构(26)——题目——输出二叉树上值大于x的算法
  9. C语言中结构化数据(变量,指针,数组,字符串,结构体和联合)的内存表示
  10. ubuntu server 18.04 和 20.04 安装 RabbitMQ
  11. Unity V3 初步使用 —— 为我的.NET项目从简单三层架构转到IOC做准备
  12. arcgis中editor在哪_leetcode 刷题工具 leetcode-editor 本地调试篇
  13. mapxtreme java_用mapXtreme Java开发web gis应用 (下)
  14. IP归属地查询API
  15. 思科交换机配置试题_思科交换机基本配置命令全集
  16. 我读《非暴力沟通》- 马歇尔 *卢森堡 - 区分观察和评论
  17. mysql查询表_mysql数据库表的查询操作-总结
  18. siamfc-pytorch代码讲解(三):demotrack
  19. 如何实时计算日累计逐单资金流
  20. Apple Car将提前“出世”,华为、百度准备好了吗?

热门文章

  1. 从其他项目中复制过来的mapper加载不进bean_手把手带你玩转k8s-一键部署springboot项目...
  2. .net pdf转图片_如何将PDF转图片?PDF转图片免费方法!
  3. 一个借口几万条数据但是只返回十条_爬虫实践之爬取10000条菜谱数据
  4. gitlens突然不显示了_监控画面突然没有了怎么办?监控画面不显示了?
  5. jpadao层继承什么_1岁英短蓝白母猫能卖多少钱,银渐层2岁公猫多少钱
  6. android 新版本gradle,Android:更新到新版本的gradle后出现“Manife...
  7. exifinterface.setattribute设置不上去_电脑自动开机怎么设置
  8. 分布式,集群,冗余的理解
  9. PWA登陆iOS了,但它还有这些缺陷
  10. kaldi运行thchs30例子