package com.example.mht;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.StringTokenizer;
import javax.activation.DataHandler;
import javax.activation.DataSource;
import javax.activation.MimetypesFileTypeMap;
import javax.mail.Message;
import javax.mail.Session;
import javax.mail.internet.InternetAddress;
import javax.mail.internet.MimeBodyPart;
import javax.mail.internet.MimeMessage;
import javax.mail.internet.MimeMultipart;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
{
// NOTE(review): most declarations in this run have lost their tokens; only the String
// fields to, cc and bcc are still readable. Later code in this file passes to/cc/bcc
// to getInetAddresses(...) when setting the message recipients.
;
;
;
;
;
String to;
;
String cc;
String bcc;
;
// NOTE(review): the declaration header, keywords and literals of this block are missing
// from the file. Presumably this is the program entry point — confirm against the
// original source before relying on it.
{
;
;
// Fetches the page text for strUrl using strEncoding.
downFileTxt(strUrl, strEncoding);
// Check on strText; the compared value and the guarded statement are missing.
(strText == )
;
// Presumably constructs the h2t object used below (constructor name and last argument missing).
(strUrl, strEncoding, );
// Invokes compile() on h2t.
h2t.compile();
}
// NOTE(review): header missing. The body reads strUrl, strEncoding and strFileName and
// assigns same-named members, so this is presumably the constructor — confirm.
{
{
// Builds strWeb from strUrl; the handled MalformedURLException suggests a URL is
// constructed here (constructor token missing).
strWeb = (strUrl);
} (MalformedURLException e) {
// The failure is only printed; the statement that followed is missing.
e.printStackTrace();
;
}
// Downloads the page text immediately and keeps it together with the encoding
// and the output file name.
.strText = downFileTxt(strUrl, strEncoding);
.strEncoding = strEncoding;
.strFileName = strFileName;
}
// NOTE(review): header missing; presumably the compile() method invoked as h2t.compile()
// elsewhere in this file — confirm.
// Visible flow: parse strText, run three extraction passes over the parsed nodes,
// substitute URLs in strText from urlMap, then call createMhtArchive.
{
// Guard over strWeb, strText, strFileName and strEncoding; the compared values are
// missing (presumably null checks) and so is the guarded statement.
(strWeb == || strText == || strFileName == || strEncoding == )
;
();
();
{
// Parses strText with the configured encoding into nodes.
createParser(strText);
parser.setEncoding(strEncoding);
nodes = parser.parse();
} (ParserException e) {
// A parse failure is only printed; execution continues with whatever nodes holds.
e.printStackTrace();
}
// Three passes over the node list; the last two also receive urlMap.
extractAllScriptNodes(nodes);
extractAllScriptNodes(nodes, urlMap);
extractAllImageNodes(nodes, urlMap);
// Walks every urlMap entry and rewrites strText with replaceAll(val, key).
( urlMap.entrySet().iterator(); iter.hasNext();) {
Map. (Map.Entry) iter.next();
(String) entry.getKey();
(String) entry.getValue();
// NOTE(review): String.replaceAll treats its first argument as a regular expression;
// if val is a raw URL, regex metacharacters in it may not match literally — confirm.
strText = strText.replaceAll(val, key);
}
{
// Writes the archive from the rewritten text plus the two URL lists.
createMhtArchive(strText, urlScriptList, urlImageList);
} (Exception e) {
e.printStackTrace();
;
}
;
}
// NOTE(review): method name and parameter list are missing; the return type Parser and
// the use of inputHTML remain. Presumably this is createParser(inputHTML), which is
// called as createParser(strText) elsewhere in this file — confirm.
Parser {
// Builds an object from inputHTML (constructor names missing; mLexer is used on the next line).
( (inputHTML));
// Combines mLexer with a feedback object built from DefaultParserFeedback.QUIET.
(mLexer, (DefaultParserFeedback.QUIET));
}
// NOTE(review): header missing. The body filters nodes, reads an attribute into href
// from one matched tag, and reassigns strWeb from it. Presumably this handles a
// <base href> style override of the page URL — confirm; the tag and attribute names
// are missing.
{
nodes.extractAllNodesThatMatch( (), );
// Only acts when the filter produced at least one node (compared values missing).
(filtered != && filtered.size() > ) {
(Tag) filtered.elementAt();
tag.getAttribute();
(href != && href.length() > ) {
{
// Replaces strWeb with a value built from href.
strWeb = (href);
} (MalformedURLException e) {
// A malformed value is only printed; strWeb keeps its previous value.
e.printStackTrace();
}
}
}
}
// NOTE(review): method name and parameters are missing; the body uses nodes and urlMap
// and yields urlList. Presumably the two-argument extractAllScriptNodes(nodes, urlMap)
// called elsewhere in this file — confirm.
// Two passes: the first reads a src-like attribute, the second reads rel/type/href.
// Each newly seen absolute URL is recorded in urlMap (absoluteURL -> innerURL) and as an
// [innerURL, absoluteURL] pair in urlList.
ArrayList {
();
nodes.extractAllNodesThatMatch( (), );
// First pass over the matched tags (tag and attribute names missing).
( ; i < filtered.size(); i++) {
(Tag) filtered.elementAt(i);
tag.getAttribute();
(src != && src.length() > ) {
src;
// Resolves the attribute value against strWeb.
makeAbsoluteURL(strWeb, innerURL);
// Records each absolute URL only once.
(absoluteURL != && !urlMap.containsKey(absoluteURL)) {
urlMap.put(absoluteURL, innerURL);
();
urlInfo.add(innerURL);
urlInfo.add(absoluteURL);
urlList.add(urlInfo);
}
// The attribute is rewritten to absoluteURL whether or not the URL was newly recorded.
tag.setAttribute(, absoluteURL);
}
}
// Second pass: a new filter, with rel, type and href read from each tag.
filtered = nodes.extractAllNodesThatMatch( (), );
( ; i < filtered.size(); i++) {
(Tag) filtered.elementAt(i);
(tag.getAttribute());
(tag.getAttribute());
tag.getAttribute();
;
// isCssFile is derived from indexOf tests on rel and type; the searched substrings
// and the keywords joining the two branches are missing.
(rel != ) {
isCssFile = rel.indexOf() != -;
} (type != ) {
isCssFile |= type.indexOf() != -;
}
// Only tags flagged as CSS with a non-empty href are recorded and rewritten.
(isCssFile && href != && href.length() > ) {
href;
makeAbsoluteURL(strWeb, innerURL);
(absoluteURL != && !urlMap.containsKey(absoluteURL)) {
urlMap.put(absoluteURL, innerURL);
();
urlInfo.add(innerURL);
urlInfo.add(absoluteURL);
urlList.add(urlInfo);
}
tag.setAttribute(, absoluteURL);
}
}
// Result: the list of [innerURL, absoluteURL] pairs collected by both passes.
urlList;
}
// NOTE(review): method name and parameters are missing; the body uses nodes and urlMap
// and yields urlList. Presumably extractAllImageNodes(nodes, urlMap), called elsewhere
// in this file — confirm.
// Single pass: each matched tag's attribute value is resolved against strWeb, newly seen
// absolute URLs are recorded in urlMap and urlList, and the attribute is rewritten.
ArrayList {
();
nodes.extractAllNodesThatMatch( (), );
( ; i < filtered.size(); i++) {
(Tag) filtered.elementAt(i);
tag.getAttribute();
(src != && src.length() > ) {
src;
makeAbsoluteURL(strWeb, innerURL);
// Records each absolute URL only once, as an [innerURL, absoluteURL] pair.
(absoluteURL != && !urlMap.containsKey(absoluteURL)) {
urlMap.put(absoluteURL, innerURL);
();
urlInfo.add(innerURL);
urlInfo.add(absoluteURL);
urlList.add(urlInfo);
}
// Rewritten to absoluteURL whether or not the URL was newly recorded.
tag.setAttribute(, absoluteURL);
}
}
urlList;
}
// NOTE(review): method name and parameters are missing; the body uses strWeb and innerURL
// and the return type is String. Presumably makeAbsoluteURL(strWeb, innerURL), called
// elsewhere in this file — confirm.
String {
// Early path: innerURL is used as-is when a lower-cased indexOf test matches. The
// searched text and expected index are missing; presumably this detects a URL that is
// already absolute — confirm.
(innerURL !=
&& innerURL.toLowerCase().indexOf() == ) {
innerURL;
}
;
{
// Builds linkUri from the base strWeb and the relative innerURL.
linkUri = (strWeb, innerURL);
} (MalformedURLException e) {
// Failure is printed; the statement that followed is missing.
e.printStackTrace();
;
}
// The result is the string form of linkUri.
linkUri.toString();
absURL;
}
// NOTE(review): method name and parameters are missing; the body uses content,
// urlScriptList and urlImageList, and the declaration ends in "Exception". Presumably
// createMhtArchive(content, urlScriptList, urlImageList), called elsewhere in this
// file — confirm.
// Visible flow: build a mail session and message, add the page text as the first body
// part, add one body part per script/image URL, then write the message to strFileName.
Exception {
();
();
// Puts smtp into the session properties (the property key is missing).
props.put(, smtp);
Session.getDefaultInstance(props, );
(session);
msg.setHeader(, );
// Sender, subject and recipients are each set only when they pass the check
// (compared values missing, presumably null).
(from != ) {
msg.setFrom( (from));
}
(subject != ) {
msg.setSubject(subject);
}
(to != ) {
InternetAddress[] toAddresses = getInetAddresses(to);
msg.setRecipients(Message.RecipientType.TO, toAddresses);
}
(cc != ) {
InternetAddress[] ccAddresses = getInetAddresses(cc);
msg.setRecipients(Message.RecipientType.CC, ccAddresses);
}
(bcc != ) {
InternetAddress[] bccAddresses = getInetAddresses(bcc);
msg.setRecipients(Message.RecipientType.BCC, bccAddresses);
}
();
// First part: the page text in strEncoding, with two headers — one built from
// strEncoding, one from strWeb (header names missing).
bp.setText(content, strEncoding);
bp.addHeader(, + strEncoding);
bp.addHeader(, strWeb.toString());
mp.addBodyPart(bp);
urlScriptList.size();
// One body part per urlScriptList entry.
( ; i < urlCount; i++) {
bp = ();
(ArrayList) urlScriptList.get(i);
urlInfo.get().toString();
// Header value: the URL decoded with strEncoding, then MIME word-encoded.
bp.addHeader(, javax.mail.internet.MimeUtility.encodeWord(java.net.URLDecoder.decode(absoluteURL, strEncoding)));
// Data source built from the URL (type name and second argument missing).
(absoluteURL, );
bp.setDataHandler( (source));
mp.addBodyPart(bp);
}
urlCount = urlImageList.size();
// One body part per urlImageList entry, built the same way as above.
( ; i < urlCount; i++) {
bp = ();
(ArrayList) urlImageList.get(i);
urlInfo.get().toString();
bp.addHeader(, javax.mail.internet.MimeUtility.encodeWord(java.net.URLDecoder.decode(absoluteURL, strEncoding)));
(absoluteURL, );
bp.setDataHandler( (source));
mp.addBodyPart(bp);
}
msg.setContent(mp);
(strFileName);
// Creates the output file when it does not exist yet.
(!dir.exists())
dir.createNewFile();
// NOTE(review): the object built from strFileName is passed inline, so this block keeps
// no reference with which to close it — confirm whether it is a stream that leaks.
msg.writeTo( (strFileName));
}
// NOTE(review): method name and parameters are missing; the body uses strName, separator
// and ID. Confirm the original signature before relying on this description.
String {
;
// When separator occurs in strName, the text after its last occurrence is passed to format(...).
(strName.lastIndexOf(separator) >= )
format(strName.substring(strName.lastIndexOf(separator) + ));
// The result expression ends by appending ID (left operand missing).
+ ID;
}
// NOTE(review): method name and parameters are missing; the body transforms strName and
// yields it. Presumably format(strName), called elsewhere in this file — confirm.
String {
// Guard on strName (compared value and guarded statement missing).
(strName == )
;
// Regex-based substitution on strName (pattern and replacement missing).
strName = strName.replaceAll(, );
;
// For every character of strName found in strText, replace that character.
// NOTE(review): strText here is presumably a local whose declaration was lost, not the
// page-text field — confirm.
( ; i < strName.length(); ++i) {
String.valueOf(strName.charAt(i));
(strText.indexOf(ch) != -) {
// String.replace(char, ...) substitutes every occurrence of that character.
strName = strName.replace(strName.charAt(i), );
}
}
strName;
}
// NOTE(review): method name and parameters are missing; the body uses jspUrl and
// strEncoding and the return type is String. Presumably downFileTxt(jspUrl, strEncoding),
// called elsewhere in this file — confirm.
// Visible flow: open a stream on the URL, read it line by line through a reader that
// uses strEncoding, and accumulate the lines in strHtml.
String {
;
;
;
;
{
(jspUrl);
textStream = url.openStream();
// Wrapping chain: textStream -> buff -> r (with strEncoding) -> br.
buff = (textStream);
r = (buff, strEncoding);
br = (r);
();
;
// Appends every line plus a suffix (the suffix literal is missing) until readLine
// fails the check.
((strLine = br.readLine()) != ) {
strHtml.append(strLine + );
}
br.close();
r.close();
textStream.close();
strHtml.toString();
} (Exception e) {
// Any failure is only printed.
e.printStackTrace();
} {
{
// Cleanup closes br, buff and textStream again when they pass the check
// (compared values missing, presumably null).
(br != )
br.close();
(buff != )
buff.close();
(textStream != )
textStream.close();
} (Exception e) {
// A failure during cleanup is reported on standard output (message literal missing).
System.out.println();
}
}
;
}
// Downloads the resource at jspUrl and yields its content as an array (element type token
// missing). NOTE(review): keywords and literals in this block are missing from the file.
[] downFileByte(String jspUrl) {
;
;
// buf starts from an initial value that is missing; it is only replaced after a
// complete read.
[] buf = ;
{
();
(jspUrl);
textStream = url.openStream();
buff = (textStream);
;
// Copies the stream one value at a time into byteArray until read() fails the check.
((ch = buff.read()) != -) {
byteArray.write(ch);
}
buf = byteArray.toByteArray();
buff.close();
textStream.close();
} (Exception e) {
// Any failure is only printed; buf keeps its initial value in that case.
e.printStackTrace();
} {
{
// Cleanup closes buff and textStream again when they pass the check.
(buff != )
buff.close();
(textStream != )
textStream.close();
} (Exception e) {
// A failure during cleanup is reported on standard output (message literal missing).
System.out.println();
}
}
buf;
}
// Splits emails into tokens and converts each token into an InternetAddress.
// NOTE(review): keywords and the delimiter literal are missing from this block.
InternetAddress[] getInetAddresses(String emails) Exception {
();
// Tokenizes emails (the delimiter argument is missing).
(emails, );
(tok.hasMoreTokens()) {
list.add(tok.nextToken());
}
list.size();
// One array slot per token, in token order.
InternetAddress[] addresses = [count];
( ; i < count; i++) {
addresses[i] = (list.get(i).toString());
}
addresses;
}
// NOTE(review): the class header is missing. The members below (content type, name, input
// and output stream accessors) suggest an implementation of javax.activation.DataSource
// backed by downloaded data — confirm the class name and declaration.
{
();
String strUrl;
String strType;
// Downloaded content; the initial value is missing.
[] dataSize = ;
();
// Initializer block: registers two entries in normalMap (keys and values missing).
{
normalMap.put(, );
normalMap.put(, );
}
// NOTE(review): constructor header missing; the body uses strUrl and strType.
{
.strType = strType;
// NOTE(review): the member is assigned before the trim/replaceAll below. Assuming the
// unqualified strUrl refers to the constructor parameter, the stored member keeps the
// unnormalized value — confirm.
.strUrl = strUrl;
strUrl = strUrl.trim();
strUrl = strUrl.replaceAll(, );
// Depending on strType (compared literals missing), the content is fetched either as
// text converted to bytes or as raw bytes.
(.equals(strType))
dataSize = downFileTxt(strUrl, strEncoding).getBytes();
(.equals(strType))
dataSize = downFileByte(strUrl);
}
// NOTE(review): header missing; presumably getContentType(). Delegates to getMimeType
// with the value of getName().
String {
getMimeType(getName());
}
// NOTE(review): header missing; presumably getName(). Yields the part of strUrl after
// the last separator, or strUrl itself when there is none.
String {
// NOTE(review): File.separatorChar is platform-dependent, while strUrl looks like a URL
// (it is passed to downFileByte/downFileTxt) — confirm the intended separator.
File.separatorChar;
(strUrl.lastIndexOf(separator) >= )
strUrl.substring(strUrl.lastIndexOf(separator) + );
strUrl;
}
// NOTE(review): header missing; presumably getMimeType(fileName). Looks strType up in
// normalMap first, then falls back to map.getContentType(fileName), then to a default
// (default literal missing).
String {
(String) normalMap.get(strType);
(type == ) {
{
type = map.getContentType(fileName);
} (Exception e) {
// Lookup failures are silently ignored; the fallback below applies.
}
(type == ) {
type = ;
}
}
type;
}
// NOTE(review): header missing; presumably getInputStream(). Yields a stream over
// dataSize, first substituting a new array when dataSize fails the check.
InputStream IOException {
(dataSize == )
dataSize = [];
(dataSize);
}
// NOTE(review): header missing; presumably getOutputStream(). Yields a
// ByteArrayOutputStream (leading tokens of the expression are missing).
OutputStream IOException {
.io.ByteArrayOutputStream();
}
}
}
Java java
Java 实现网页内容转换为 MHT 文件格式代码
Java 网页转 MHT 工具类利用 JavaMail 与 HtmlParser 解析 HTML 结构,提取图片与脚本资源,将其打包为符合 MIME 标准的 MHT 文件。代码涵盖 URL 绝对化、资源下载流处理及附件数据源封装,支持自定义编码与邮件头设置,适用于网页归档需求。

