已知网页源代码-获取文件中的url

package com.zx.cn.dao;import java.io.BufferedWriter;import java.io.File;import java.io.FileWriter;import java.io.IOException;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;/*** 已知youku源码文件,获取文件中的url* @author yltd**/public class HtmlGetUrl {public static void main(String[] args) {//已知youku的源码getLocalFile("C:\\Users\\yltd\\Desktop\\login.html");}/*** 截取指定字段*/public static String  subString(String str, String strStart, String strEnd) {/* 找出指定的2个字符在 该字符串里面的 位置 */int strStartIndex = str.indexOf(strStart)+3;int strEndIndex = str.indexOf(strEnd);/* index为负数 即表示该字符串中没有该字符 *//* 开始截取 */String result = str.substring(strStartIndex, strEndIndex);return result;}/*** 将数据(字符串)写入文档。* @param line* @param toFilePath*/private static void toFile(String line, String toFilePath) {File des = new File(toFilePath);if (!des.exists()) { // 判断是否存在,不存在就创建try {// 创建文件des.createNewFile();} catch (IOException e) {e.printStackTrace();}}BufferedWriter writer;try {writer = new BufferedWriter(new FileWriter(toFilePath));System.out.println("line = "+line);writer.write(line);writer.newLine();writer.close();} catch (IOException e) {e.printStackTrace();}}private static void getLocalFile(String localFilePath) {StringBuffer linkSB=new StringBuffer();Document doc =null;if(localFilePath!=null && localFilePath!="") {try {File input = new File(localFilePath);doc = Jsoup.parse(input, "UTF-8", "");} catch (IOException e) {e.printStackTrace();}}//数据提取//Element content = doc.getElementById("content");//Elements links = content.getElementsByTag("a");//数据提取Elements links = doc.select("div.p-thumb").select("a[href]"); //div class:p-thumb 下的带有href属性的a元素for (Element link : links) {String linkHref = link.attr("href");//获取href中的数据。//截取id 拼接真实路径String id = subString(linkHref, "id_", ".html");String realURL = "https://v.youku.com/v_show/id_"+id+".html";System.out.println("realURL = " +realURL);linkSB.append(realURL);linkSB.append("\r\n");}toFile(linkSB.toString(), "D:\\url.txt");}}可能用到的 jar包:commons-codec-1.10.jarcommons-httpclient-3.1.jarcommons-io-2.6.jarcommons-logging-1.2.jarhttpclient-4.5.6.jarjson-rpc-1.0.jarjsoup-1.12.1.jarjuniversalchardet-1.0.3.jar

(0)

相关推荐