java html2text_将HTML转化为TEXT的Java类
为了支持全文检索,有必要将HTML格式的文章转化为纯文本格式,因此我设计了一个基本的WebFormatter类,提供一个简单的public static String html2text(String html),将HTML格式转化为Text:
/*
* File: WebFormatter.java
* Created on 2005-6-24
* Author: Liao Xuefeng, asklxf@163.com
* Copyright (C) 2005, Liao Xuefeng.
*/
package com.mboker.blog.web.util;
import java.util.*;
import java.text.SimpleDateFormat;
/**
* Do some format on web display.
*
* @author Xuefeng
*/
public class WebFormatter {
public static String html2text(String html) {
StringBuffer sb = new StringBuffer(html.length());
char[] data = html.toCharArray();
int start = 0;
boolean previousIsPre = false;
Token token = null;
for(;;) {
token = parse(data, start, previousIsPre);
if(token==null)
break;
previousIsPre = token.isPreTag();
sb = sb.append(token.getText());
start += token.getLength();
}
return sb.toString();
}
private static Token parse(char[] data, int start, boolean previousIsPre) {
if(start>=data.length)
return null;
// try to read next char:
char c = data[start];
if(c=='
// this is a tag or comment or script:
int end_index = indexOf(data, start+1, '>');
if(end_index==(-1)) {
// the left is all text!
return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);
}
String s = new String(data, start, end_index-start+1);
// now we got s="<...>":
if(s.startsWith("");
if(end_comment_index==(-1)) {
// illegal end, but treat as comment:
return new Token(Token.TOKEN_COMMENT, data, start, data.length, previousIsPre);
}
else
return new Token(Token.TOKEN_COMMENT, data, start, end_comment_index+3, previousIsPre);
}
String s_lowerCase = s.toLowerCase();
if(s_lowerCase.startsWith("");
if(end_script_index==(-1))
// illegal end, but treat as script:
return new Token(Token.TOKEN_SCRIPT, data, start, data.length, previousIsPre);
else
return new Token(Token.TOKEN_SCRIPT, data, start, end_script_index+9, previousIsPre);
}
else { // this is a tag:
return new Token(Token.TOKEN_TAG, data, start, start+s.length(), previousIsPre);
}
}
// this is a text:
int next_tag_index = indexOf(data, start+1, '
if(next_tag_index==(-1))
return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);
return new Token(Token.TOKEN_TEXT, data, start, next_tag_index, previousIsPre);
}
private static int indexOf(char[] data, int start, String s) {
char[] ss = s.toCharArray();
// TODO: performance can improve!
for(int i=start; i
// compare from data[i] with ss[0]:
boolean match = true;
for(int j=0; j
if(data[i+j]!=ss[j]) {
match = false;
break;
}
}
if(match)
return i;
}
return (-1);
}
private static int indexOf(char[] data, int start, char c) {
for(int i=start; i
if(data[i]==c)
return i;
}
return (-1);
}
}
/**
 * One lexical piece of an HTML document (text, tag, comment or script block)
 * together with its plain-text rendering.
 *
 * Rendering rules:
 * - br and p tags become a line break, li becomes a bulleted new line,
 *   hr becomes a dashed separator line;
 * - closing td becomes a tab, closing tr and li become a line break;
 * - every other tag, and all comments and scripts, render as nothing;
 * - text has its whitespace collapsed (unless it directly follows a pre tag)
 *   and its character entities decoded.
 */
class Token {
    public static final int TOKEN_TEXT = 0;    // plain text between tags
    public static final int TOKEN_COMMENT = 1; // comment like <!-- ... -->
    public static final int TOKEN_TAG = 2;     // tag like <a ...>, <br/>, </td>
    public static final int TOKEN_SCRIPT = 3;  // <script ...> ... </script>

    // Opening tags are matched by prefix, see compareTag().
    private static final char[] TAG_BR = "<br".toCharArray();
    private static final char[] TAG_P = "<p".toCharArray();
    private static final char[] TAG_LI = "<li".toCharArray();
    private static final char[] TAG_PRE = "<pre".toCharArray();
    private static final char[] TAG_HR = "<hr".toCharArray();
    // Closing tags are matched as a plain prefix, see compareString().
    private static final char[] END_TAG_TD = "</td>".toCharArray();
    private static final char[] END_TAG_TR = "</tr>".toCharArray();
    private static final char[] END_TAG_LI = "</li>".toCharArray();

    /** Longest entity, including '&' and ';', that toText() tries to decode. */
    private static final int MAX_ENTITY_LENGTH = 10;

    /** Named entities (String) mapped to their replacement text (String). */
    private static final Map SPECIAL_CHARS = new HashMap();

    private int type;
    private String html;          // original html
    private String text = null;   // rendered text, null means "renders as nothing"
    private int length = 0;       // html length
    private boolean isPre = false; // is this token a pre tag?

    static {
        SPECIAL_CHARS.put("&quot;", "\"");
        SPECIAL_CHARS.put("&lt;", "<");
        SPECIAL_CHARS.put("&gt;", ">");
        SPECIAL_CHARS.put("&amp;", "&");
        SPECIAL_CHARS.put("&reg;", "(r)");
        SPECIAL_CHARS.put("&copy;", "(c)");
        SPECIAL_CHARS.put("&nbsp;", " ");
        // pound sign, written as an escape so the source stays ASCII-only
        SPECIAL_CHARS.put("&pound;", "\u00a3");
    }

    /**
     * Creates a token from {@code data[start..end)} and renders it right away.
     *
     * @param type          one of the TOKEN_* constants
     * @param data          the complete HTML input
     * @param start         index of the first character of the token
     * @param end           index just past the last character of the token
     * @param previousIsPre whether the preceding token was a pre tag; only
     *                      used when this token is text
     */
    public Token(int type, char[] data, int start, int end, boolean previousIsPre) {
        this.type = type;
        this.length = end - start;
        this.html = new String(data, start, length);
        parseText(previousIsPre);
    }

    /** @return the number of input characters this token covers */
    public int getLength() {
        return length;
    }

    /** @return {@code true} if this token is an opening pre tag */
    public boolean isPreTag() {
        return isPre;
    }

    /** Computes {@code text} (and {@code isPre}) from the token type and html. */
    private void parseText(boolean previousIsPre) {
        if (type == TOKEN_TAG) {
            char[] cs = html.toCharArray();
            if (compareTag(TAG_BR, cs) || compareTag(TAG_P, cs)) {
                text = "\n";
            } else if (compareTag(TAG_LI, cs)) {
                text = "\n* ";
            } else if (compareTag(TAG_PRE, cs)) {
                isPre = true;
            } else if (compareTag(TAG_HR, cs)) {
                text = "\n--------\n";
            } else if (compareString(END_TAG_TD, cs)) {
                text = "\t";
            } else if (compareString(END_TAG_TR, cs) || compareString(END_TAG_LI, cs)) {
                text = "\n";
            }
        } else if (type == TOKEN_TEXT) {
            text = toText(html, previousIsPre);
        }
        // comments and scripts keep text == null and render as ""
    }

    /** @return the plain-text rendering of this token, never {@code null} */
    public String getText() {
        return text == null ? "" : text;
    }

    /**
     * Renders a text token.
     *
     * In pre mode every space is kept and each line break (CR, LF or CRLF)
     * becomes a single '\n'. Otherwise any run of spaces and line breaks is
     * collapsed into one space, so that words on adjacent source lines stay
     * separated. Known named entities and numeric entities are decoded; an
     * '&' that does not start a decodable entity is copied literally.
     *
     * @param html  the raw text
     * @param isPre whether to render in pre mode
     * @return the rendered text
     */
    private String toText(String html, final boolean isPre) {
        char[] cs = html.toCharArray();
        StringBuffer buffer = new StringBuffer(cs.length);
        int start = 0;
        boolean continueSpace = false; // previous char was collapsible whitespace
        while (start < cs.length) {
            char current = cs[start];
            char next = (start + 1 < cs.length) ? cs[start + 1] : '\0';
            if (current == ' ') {
                if (isPre || !continueSpace) {
                    buffer.append(' ');
                }
                continueSpace = true;
                start++;
                continue;
            }
            if (current == '\r' || current == '\n') {
                if (isPre) {
                    buffer.append('\n');
                } else if (!continueSpace) {
                    buffer.append(' ');
                }
                continueSpace = true;
                // CRLF counts as a single line break
                start += (current == '\r' && next == '\n') ? 2 : 1;
                continue;
            }
            // any other character ends a whitespace run
            continueSpace = false;
            if (current != '&') {
                buffer.append(current);
                start++;
                continue;
            }
            // maybe an entity: look for the terminating ';' close by
            int entityLength = readUtil(cs, start, ';', MAX_ENTITY_LENGTH);
            String decoded = (entityLength == -1) ? null : decodeEntity(cs, start, entityLength);
            if (decoded == null) {
                // just a literal '&'
                buffer.append('&');
                start++;
            } else {
                // skip the whole entity, not only its first character
                buffer.append(decoded);
                start += entityLength;
            }
        }
        return buffer.toString();
    }

    /**
     * Decodes the entity occupying {@code cs[start .. start+length)}, which
     * begins with '&' and ends with ';'.
     *
     * Supports the names in SPECIAL_CHARS, decimal references and hexadecimal
     * references (an 'x' or 'X' after the '#') for codes 1..65535.
     *
     * @return the replacement text, or {@code null} if this is not an entity
     *         that can be decoded
     */
    private String decodeEntity(char[] cs, int start, int length) {
        String named = (String) SPECIAL_CHARS.get(new String(cs, start, length));
        if (named != null) {
            return named;
        }
        // shortest numeric form is '&', '#', one digit, ';'
        if (length < 4 || cs[start + 1] != '#') {
            return null;
        }
        boolean hex = cs[start + 2] == 'x' || cs[start + 2] == 'X';
        int digitsStart = hex ? start + 3 : start + 2;
        int digitsLength = start + length - 1 - digitsStart;
        try {
            int code = Integer.parseInt(new String(cs, digitsStart, digitsLength), hex ? 16 : 10);
            if (code > 0 && code < 65536) {
                return String.valueOf((char) code);
            }
        } catch (NumberFormatException ignored) {
            // not a number after all: the caller emits the '&' literally
        }
        return null;
    }

    /**
     * Scans {@code cs} from {@code start} for the character {@code util},
     * looking at no more than {@code maxLength} characters.
     *
     * @return the number of characters from {@code start} up to and including
     *         the match, or -1 if it was not found within range
     */
    private int readUtil(final char[] cs, final int start, final char util, final int maxLength) {
        int end = start + maxLength;
        if (end > cs.length) {
            end = cs.length;
        }
        for (int i = start; i < end; i++) {
            if (cs[i] == util) {
                return i - start + 1;
            }
        }
        return -1;
    }

    /**
     * Tests whether {@code tag} is an opening tag with the name given by
     * {@code ori_tag}, ignoring case.
     *
     * The character following the name must not be a letter, so that the
     * prefix for the p tag does not also match the pre tag.
     *
     * @param ori_tag lower-case prefix such as '<' followed by the tag name
     * @param tag     the complete tag text
     */
    private boolean compareTag(final char[] ori_tag, char[] tag) {
        if (ori_tag.length >= tag.length) {
            return false;
        }
        for (int i = 0; i < ori_tag.length; i++) {
            if (Character.toLowerCase(tag[i]) != ori_tag[i]) {
                return false;
            }
        }
        // tag is longer than ori_tag here, so this index is always valid
        char following = Character.toLowerCase(tag[ori_tag.length]);
        return following < 'a' || following > 'z';
    }

    /**
     * Tests whether {@code comp} starts with {@code ori}, ignoring the case
     * of {@code comp}; {@code ori} is expected in lower case.
     */
    private boolean compareString(final char[] ori, char[] comp) {
        if (ori.length > comp.length) {
            return false;
        }
        for (int i = 0; i < ori.length; i++) {
            if (Character.toLowerCase(comp[i]) != ori[i]) {
                return false;
            }
        }
        return true;
    }

    /** @return the original HTML this token was created from */
    public String toString() {
        return html;
    }
}
注意,请先将html中的<body>...</body>部分提取出来,再交给WebFormatter处理,因为html->text转换实质是删除所有标签(某些标签如<br>被转化为'\n')、Script和注释,对于JavaScript生成的动态内容(例如document.write)无能为力。
posted on 2006-04-07 16:33 SIMONE 阅读(695) 评论(0) 编辑 收藏 所属分类: JAVA
java html2text_将HTML转化为TEXT的Java类相关推荐
- for linux pdf转mobi_pdftotext —— Linux/Unix中将PDF文件转化为Text文本格式的利器
安装 pdftotext 到 RedHat / RHEL / Fedora / CentOS / Ubuntu 在不同的Linux分发版本中使用poppler-utils包安装pdftotext(Ce ...
- 解决Java.lang.NoClassDefFoundError:com/lowagie/text/Elemen的问题
正在写用itext导出word的项目,在pom.xml里写了以下代码下载itext 2-1-7.jar. <dependency><groupId>com.lowagie< ...
- Java把一个文件转化为byte字节数组
Java把一个文件转化为byte字节数组 /*** 把一个文件转化为byte字节数组.** @return*/private byte[] fileConvertToByteArray(File fi ...
- java 数组转化为arraylist_在Java中怎样把数组转换为ArrayList?
本文分析了Stack Overflow上最热门的的一个问题的答案,提问者获得了很多声望点,使得他得到了在Stack Overflow上做很多事情的权限.这跟我没什么关系,我们还是先看看这个问题吧. 这 ...
- 面向 Java 开发人员的 Ajax: 构建动态的 Java 应用程序
面向 Java 开发人员的 Ajax: 构建动态的 Java 应用程序 Ajax 为更好的 Web 应用程序铺平了道路 在 Web 应用程序开发中,页面重载循环是最大的一个使用障碍,对于 Java™ ...
- java中商业数据计算时用到的类BigDecimal和DecimalFormat
1.引言 借用<Effactive Java>这本书中的话,float和double类型的主要设计目标是为了科学计算和工程计算.他们执行二进制浮点运算,这是为了在广域数值范围上提供较为精确 ...
- Java核心类库之(常用API、字符串类、集合类、泛型)
目录 1 常用API 1.1 Math类 1.2 System类 1.3 Object类 1.4 Objects类 1.5 Arrays类 1.6 基本类型包装类 1.6.1 Integer类概述和使 ...
- java基础入门课后习题答案_《Java基础入门》课后习题及答案
<Java基础入门>课后习题及答案Java基础入门,课后习题,答案 博学谷--让IT教学更简单,让IT学习更有效 第6章JavaAPI 一.填空题 1.在Java中定义了两个类来封装对字符 ...
- Java改知能机_Java 面试突击之 Java 并发知识基础 进阶考点全解析
版权说明:本文内容根据 github 开源项目整理所得 项目地址:https://github.com/Snailclimb/JavaGuidegithub.com 一.基础 什么是线程和进程? 何 ...
- java面试32问_学员分享:JAVA面试32问(11-20)
第十一,short s1 = 1; s1 = s1 + 1;有什么错? short s1 = 1; s1 += 1;有什么错? short s1 = 1; s1 = s1 + 1;有错,s1是shor ...
最新文章
- MySQL like 通配符是_MySql模糊查询like通配符使用详细介绍
- 【Ubuntu】ubuntu更新设置
- Angular 路由时如何在 Component 之间传递参数
- 用最小二乘法拟合任意次函数曲线(C#)
- LeetCode 845. 数组中的最长山脉(中心扩展)
- POI操作EXCEL2007,报javax.xml.stream.XMLEventFactory.newFactory()错误!
- 吴恩达|机器学习作业8.0.异常检测
- 小蚂蚁学习数据结构(26)——题目——输出二叉树上值大于x的算法
- C语言中结构化数据(变量,指针,数组,字符串,结构体和联合)的内存表示
- ubuntu server 18.04 和 20.04 安装 RabbitMQ
- Unity V3 初步使用 —— 为我的.NET项目从简单三层架构转到IOC做准备
- arcgis中editor在哪_leetcode 刷题工具 leetcode-editor 本地调试篇
- mapxtreme java_用mapXtreme Java开发web gis应用 (下)
- IP归属地查询API
- 思科交换机配置试题_思科交换机基本配置命令全集
- 我读《非暴力沟通》- 马歇尔 *卢森堡 - 区分观察和评论
- mysql查询表_mysql数据库表的查询操作-总结
- siamfc-pytorch代码讲解(三):demotrack
- 如何实时计算日累计逐单资金流
- Apple Car将提前“出世”,华为、百度准备好了吗?
热门文章
- 从其他项目中复制过来的mapper加载不进bean_手把手带你玩转k8s-一键部署springboot项目...
- .net pdf转图片_如何将PDF转图片?PDF转图片免费方法!
- 一个借口几万条数据但是只返回十条_爬虫实践之爬取10000条菜谱数据
- gitlens突然不显示了_监控画面突然没有了怎么办?监控画面不显示了?
- jpadao层继承什么_1岁英短蓝白母猫能卖多少钱,银渐层2岁公猫多少钱
- android 新版本gradle,Android:更新到新版本的gradle后出现“Manife...
- exifinterface.setattribute设置不上去_电脑自动开机怎么设置
- 分布式,集群,冗余的理解
- PWA登陆iOS了,但它还有这些缺陷
- kaldi运行thchs30例子