JSoup小例子
- 374 次检阅

emmmm……好像没有什么要备注的,就是自己练手爬信息吧。


package com.htjf.main;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small jsoup scraping example.
 *
 * <p>Walks the company listing pages of ybzhan.cn, and for every company entry
 * collects its name, main products and (when the shop has its own site) the
 * URLs shown on the shop's "contactus.html" page. One semicolon-separated row
 * per company is appended to a CSV file.
 */
public class HelloWordJSoup {
	/** Site root that the (relative) company links are appended to. */
	private static final String SITE_ROOT = "http://www.ybzhan.cn";
	/** First and last listing page numbers to crawl (inclusive). */
	private static final int FIRST_PAGE = 1;
	private static final int LAST_PAGE = 100;
	/**
	 * Labels looked for on the contact page, in matching priority order.
	 * The index of a label is also the index of its value in the array
	 * returned by {@link #fetchContactFields(String)}.
	 */
	private static final String[] CONTACT_LABELS = {"个 性 化", "商铺网址", "公司网站"};
	private static final int PERSONALITY = 0;
	private static final int SHOP = 1;
	private static final int COMPANY = 2;

	/**
	 * Crawls every listing page and appends the rows found on each page to the
	 * output file. A failure on one page is reported and the crawl continues
	 * with the next page.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		HelloWordJSoup writer = new HelloWordJSoup();
		for (int i = FIRST_PAGE; i <= LAST_PAGE; i++) {
			String url = SITE_ROOT + "/Company/a_t0/list_p" + i + ".html";
			System.out.println(url);
			StringBuffer stringBuffer = new StringBuffer();
			try {
				Document doc = Jsoup.connect(url).get();
				Elements companyLists = doc.select(".companyList");
				for (Element companyList : companyLists) {
					appendCompanyRow(companyList, stringBuffer);
				}
			} catch (IOException e) {
				// The listing page itself could not be loaded; move on to the next one.
				e.printStackTrace();
			}
			// Written outside the try so rows gathered before a failure are not lost.
			writer.writerData(stringBuffer);
		}
	}

	/**
	 * Extracts one company entry and appends its CSV row to {@code row}.
	 * Entries without a company link are skipped instead of crashing the crawl.
	 *
	 * @param companyList the {@code .companyList} element of one company
	 * @param row         buffer the finished row (including line separator) is appended to
	 */
	private static void appendCompanyRow(Element companyList, StringBuffer row) {
		// Company name; first() returns null when the selector matches nothing.
		Element companyNameDiv = companyList.select("div.companyName").first();
		Element link = companyNameDiv == null ? null : companyNameDiv.select("a").first();
		if (link == null) {
			System.err.println("Skipping company entry without a name link");
			return;
		}
		String shopUrl = SITE_ROOT + link.attr("href");
		String companyName = link.text();

		// Main products; left empty when the entry has no such paragraph.
		Element ps = companyList.select("dt > p").first();
		String mainProducts = ps == null ? "" : ps.text().replace("主营产品", "");

		System.out.println(shopUrl);
		String[] contact = {"", "", ""};
		// Only shops with their own site (not a plain detail page) have a contact page.
		if (!StringUtil.isBlank(shopUrl) && !shopUrl.contains("Company/Detail")) {
			try {
				contact = fetchContactFields(shopUrl);
			} catch (IOException e) {
				// Keep the row: every contact column falls back to shopUrl below.
				e.printStackTrace();
			}
		}

		// Write the row
		row.append(companyName + ";");
		row.append(mainProducts + ";");
		row.append(valueOrFallback(contact[SHOP], "商铺网址:", shopUrl) + ";");
		row.append(valueOrFallback(contact[COMPANY], "公司网站:", shopUrl) + ";");
		row.append(valueOrFallback(contact[PERSONALITY], "个 性 化:", shopUrl));
		row.append(System.lineSeparator()); // line break
	}

	/**
	 * Loads {@code shopUrl + "/contactus.html"} and picks out the labelled lines.
	 * {@code <p>} elements are scanned first (a later match overwrites an earlier
	 * one); {@code <dl>} elements are then used only to fill fields still blank.
	 *
	 * @param shopUrl base URL of the shop
	 * @return three-element array indexed by {@link #PERSONALITY}, {@link #SHOP},
	 *         {@link #COMPANY}; entries are the full element text or {@code ""}
	 * @throws IOException if the contact page cannot be fetched
	 */
	private static String[] fetchContactFields(String shopUrl) throws IOException {
		Document contactusDoc = Jsoup.connect(shopUrl + "/contactus.html").get();
		String[] fields = {"", "", ""};
		for (Element element : contactusDoc.getElementsByTag("p")) {
			String text = element.text();
			int index = labelIndex(text);
			if (index >= 0) {
				fields[index] = text;
			}
		}
		if (StringUtil.isBlank(fields[PERSONALITY]) || StringUtil.isBlank(fields[SHOP])
				|| StringUtil.isBlank(fields[COMPANY])) {
			for (Element element : contactusDoc.getElementsByTag("dl")) {
				String text = element.text();
				int index = labelIndex(text);
				if (index >= 0 && StringUtil.isBlank(fields[index])) {
					fields[index] = text;
				}
			}
		}
		return fields;
	}

	/**
	 * Returns the index of the first entry of {@link #CONTACT_LABELS} contained
	 * in {@code text}, or {@code -1} when none is.
	 */
	private static int labelIndex(String text) {
		for (int i = 0; i < CONTACT_LABELS.length; i++) {
			if (text.contains(CONTACT_LABELS[i])) {
				return i;
			}
		}
		return -1;
	}

	/**
	 * Returns {@code value} with {@code prefix} removed and trimmed, or
	 * {@code fallback} when {@code value} is blank.
	 */
	private static String valueOrFallback(String value, String prefix, String fallback) {
		if (StringUtil.isBlank(value)) {
			return fallback;
		}
		return value.replace(prefix, "").trim();
	}

	/**
	 * Appends the buffer's content to the output CSV file and then empties the
	 * buffer. I/O failures are reported on standard error; the buffer is cleared
	 * in either case.
	 *
	 * @param stringBuffer rows to append; cleared before returning
	 */
	public void writerData(StringBuffer stringBuffer){
		String fileName = "G:"+File.separator+"pushFile_test"+File.separator+"data.csv";
		File writeFile  = new File(fileName); // output file path
		// FileWriter creates the file itself but not a missing directory.
		File parent = writeFile.getParentFile();
		if (parent != null && !parent.exists()) {
			parent.mkdirs();
		}

		// try-with-resources closes the writer even when write/flush throws.
		try (FileWriter out = new FileWriter(writeFile, true)) {
			if (stringBuffer.length() > 0) {
				out.write(stringBuffer.toString());
			}
			out.flush();
		} catch (IOException e) {
			e.printStackTrace();
		}
		stringBuffer.setLength(0);
	}
}

结束...

分享到:

这篇文章还没有评论

发表评论