htmaltotext
TRANSCRIPT
-
8/3/2019 htmalTOtext
1/5
Code :import java.io.File;import java.io.FileInputStream;
import java.io.Reader;import java.io.StringReader;
import java.io.IOException;
/**
 * Convert text/html into text/plain
 *
 * Author: Omindra Kumar Rana
 * Email: [email protected]
 *
 * @version 1.0 $Date: May 10, 2005 $
 */
public class HTML2Text
{ boolean body_found = false;boolean in_body = false;boolean center = false;boolean pre = false;String href = "";
public String convert(String source) throws Exception{
StringBuffer result = new StringBuffer();StringBuffer result2 = new StringBuffer();StringReader input = new StringReader(source);
try{String text = null;int c = input.read();
while (c != -1) // Convert until EOF{text = "";if (c == '
-
8/3/2019 htmalTOtext
2/5
else if (specialchar.equals("copy;") specialchar.equals("#169"))
text = "[Copyright]";else if (specialchar.equals("reg;") specialchar.equals("#174"
))text = "[Registered]";else if (specialchar.equals("trade;") specialchar.equals("#15
3"))text = "[Trademark]";elsetext = "&" + specialchar;}else if (!pre && Character.isWhitespace((char)c)){StringBuffer s = in_body ? result : result2;if (s.length() > 0 && Character.isWhitespace(s.charAt(s.length()
-1)))text = "";
else text = " ";
}else{text = "" + (char)c;}
StringBuffer s = in_body ? result : result2;s.append(text);
c = input.read();}}catch (Exception e)
{input.close();throw e;}
StringBuffer s = body_found ? result : result2;return s.toString().trim();
}
String getTag(Reader r) throws IOException{
StringBuffer result = new StringBuffer();int level = 1;
result.append('
-
8/3/2019 htmalTOtext
3/5
StringBuffer result = new StringBuffer();r.mark(1);//Mark the present position in the streamint c = r.read();
while (Character.isLetter((char)c)){result.append((char)c);
r.mark(1);c = r.read();}
if (c == ';') result.append(';');else r.reset();
return result.toString();}
boolean isTag(String s1, String s2){
s1 = s1.toLowerCase();String t1 = "";String t2 = "
-
8/3/2019 htmalTOtext
4/5
isTag(t,"/h3") isTag(t,"/h4") isTag(t,"/h5") isTag(t,"/h6") isTag(t,"/h7"))
result = "";
else if (isTag(t,"/dl"))result = "
";
else if (isTag(t,"dd"))result = "
* ";else if (isTag(t,"dt"))result = " ";else if (isTag(t,"li"))result = "
* ";else if (isTag(t,"/ul"))result = "
";else if (isTag(t,"/ol"))
result = "";else if (isTag(t,"hr"))result = "_________________________________________
";else if (isTag(t,"table"))result = "
";else if (isTag(t,"/table"))result = "
";else if (isTag(t,"form"))
result = "";
else if (isTag(t,"/form"))result = "
";else if (isTag(t,"b"))result = "*";else if (isTag(t,"/b"))result = "*";else if (isTag(t,"i"))result = """;else if (isTag(t,"/i"))result = """;else if (isTag(t,"img")){int idx = t.indexOf("alt="");if (idx != -1){idx += 5;int idx2 = t.indexOf(""",idx);result = t.substring(idx,idx2);}}else if (isTag(t,"a")){
int idx = t.indexOf("href="");if (idx != -1){
-
8/3/2019 htmalTOtext
5/5
idx += 6;int idx2 = t.indexOf(""",idx);href = t.substring(idx,idx2);}else{href = "";
}}else if (isTag(t,"/a")){if (href.length() > 0){result = " [ " + href + " ]";href = "";}}
return result;
}
/**
 * Command-line entry point: reads an HTML file, converts it to plain text
 * with {@link #convert(String)} and prints the result to standard output.
 *
 * @param argv optional; {@code argv[0]} is the path of the HTML file to
 *             convert. When no argument is supplied the file
 *             {@code html_test_file.html} in the working directory is used.
 * @throws Exception if the file cannot be opened or read, or if the
 *                   conversion itself fails
 */
public static void main(String argv[]) throws Exception {
    // Test the array length first: indexing argv[0] on an empty array throws
    // ArrayIndexOutOfBoundsException, which made the default-file fallback
    // unreachable.
    File file;
    if (argv.length > 0 && argv[0] != null) {
        file = new File(argv[0]);
    } else {
        file = new File("html_test_file.html");
    }

    // Size the buffer from the file length rather than available(), which is
    // only an estimate of what can be read without blocking.
    // NOTE(review): files larger than Integer.MAX_VALUE bytes are not supported.
    byte buf[] = new byte[(int) file.length()];
    int filled = 0;

    FileInputStream fis = new FileInputStream(file);
    try {
        // A single read() may return fewer bytes than requested, so keep
        // reading until the buffer is full or end-of-file is reached.
        while (filled < buf.length) {
            int n = fis.read(buf, filled, buf.length - filled);
            if (n < 0) {
                break;
            }
            filled += n;
        }
    } finally {
        // Always release the file handle, on success and on failure alike.
        fis.close();
    }

    // Decodes with the platform default charset, as the original code did.
    String s = new String(buf, 0, filled);
    HTML2Text h = new HTML2Text();
    System.out.println(h.convert(s));
}
} // end of class HTML2Text