Java 通过 Jsoup 爬取中国知网(CNKI)(二):自动获取

Shaka 5天前 ⋅ 7 阅读

 

// Collects the CNKI search results for filterTerm into periodicalList:
//   1. POST the query built by putTogether() to SearchHandler.ashx and keep the response cookies.
//   2. Load the result list (brief.aspx) with those cookies and read the page count.
//   3. Walk the result pages, open each article's detail page for its abstract, and add every
//      article not already known (by link or by title + author) to periodicalList.
List<TrPeriodical> periodicalList = new ArrayList<TrPeriodical>();
String reqStr = putTogether(filterTerm);
String searchUrl = "https://kns.cnki.net/kns/request/SearchHandler.ashx?" + reqStr;
logger.info(searchUrl);
// Original author's note: "the original link redirects". Only the cookies of this response
// are used afterwards, so the returned body is not read.
// Sample response body noted by the original author:
// ASP.brief_result_aspx&isinEn=1&dbPrefix=SCDB&dbCatalog=...&ConfigFile=SCDB.xml&research=off
pageConn.url(searchUrl)
        .method(Connection.Method.POST)
        .referrer("https://kns.cnki.net/kns/brief/result.aspx?dbprefix=CFLS&crossDbcodes=CJFQ,CDFD,CMFD,CPFD,IPFD,CCND,CCJD")
        .post();
// Cookies returned by the search request.
Map<String, String> cookies = pageConn.response().cookies();
String url = "http://kns.cnki.net/kns/brief/brief.aspx?pagename=ASP.brief_result_aspx&dbPrefix=CFLS&DisplayMode=custommode";
// New connection for the result list, carrying the search cookies.
Connection con = Jsoup.connect(url);
con.cookies(cookies);
Document doc = con.get();

Elements pagerTitleCells = doc.getElementsByClass("pagerTitleCell");
String pagerTitle = pagerTitleCells.isEmpty() ? "" : pagerTitleCells.get(0).text();
logger.info(filterTerm.getTradeName()+"-------"+filterTerm.getGeneralName()+":"+pagerTitle);

// Number of result pages, read from the "current/total" marker; 1 when the marker is missing
// or has no "/". getElementsByClass never returns null, so no null check (and no separate
// "no marker" code path) is needed.
int pageCount = 1;
Elements countPage = doc.getElementsByClass("countPageMark");
if (!countPage.isEmpty()) {
    String[] pageParts = countPage.get(0).text().split("/");
    if (pageParts.length > 1) {
        pageCount = Integer.parseInt(pageParts[1].trim());
    }
}
Element nextPage = doc.getElementById("Page_next");
logger.info("获取页数:" + pageCount);

// Sent with every list-page request from here on.
cookies.put("RsPerPage", "50");
// Loop over the result pages.
for (int i = 1; i <= pageCount; i++) {
    if (i != 1) {
        if (nextPage == null) {
            // The previous page offers no "next" link: nothing left to fetch.
            break;
        }
        url = "http://kns.cnki.net/kns/brief/brief.aspx" + nextPage.attr("href");
    }
    Connection pageCon = Jsoup.connect(url);
    pageCon.timeout(50000);
    pageCon.cookies(cookies);
    doc = pageCon.get();
    // Refresh the "next" link from the page just loaded; reusing the link of the very first
    // document would request the same page on every later iteration.
    nextPage = doc.getElementById("Page_next");

    // NOTE(review): title/author/journal cells are paired purely by index; this assumes every
    // result row contains all three cells - confirm against the page markup.
    Elements titleCells = doc.getElementsByClass("title_c");
    Elements authorCells = doc.getElementsByClass("author");
    Elements journalCells = doc.getElementsByClass("journal");
    // Loop over the rows of the current page.
    for (int j = 0; j < titleCells.size(); j++) {
        Element titleCell = titleCells.get(j);
        String title = titleCell.text();
        String author = authorCells.get(j).text();
        String journal = journalCells.get(j).text();

        // Open the detail page to read the full abstract (other fields could be read here too).
        String link = titleCell.getElementsByTag("a").attr("href");
        String detailUrl = "https://kns.cnki.net" + link;
        Document detailDoc = pageConn.url(detailUrl).method(Connection.Method.GET).referrer(detailUrl).get();
        logger.info(detailUrl);
        Element summary = detailDoc.getElementById("ChDivSummary");
        String abstractText = summary != null ? summary.text() : "";

        // Original author's note: the first ~1000 saved records lacked the CNKI address, hence
        // the double check: 1. by link, 2. by title + author.
        // NOTE(review): getByLink receives the relative link while the stored URL is absolute -
        // confirm the lookup accounts for that.
        TrPeriodical byLink = getByLink(link, filterTerm.getCorp(), filterTerm.getTradeName(),"知网");
        TrPeriodical byTitleAndAuthor = getByTitleAndAuthor(title,author, filterTerm.getCorp(), filterTerm.getTradeName(),"知网");
        if (byLink != null || byTitleAndAuthor != null) {
            logger.info("知网重复记录跳过," + "第" + i + "页第" + j + "条数据,文件名为:" + title);
            continue;
        }

        TrPeriodical periodical = new TrPeriodical();
        periodical.setTitle(title);
        periodical.setSearchCompany(filterTerm.getCorp());
        periodical.setLinkUrl(detailUrl);
        periodical.setAbstractText(abstractText);
        periodical.setDrugName(filterTerm.getTradeName());
        periodical.setGeneralName(filterTerm.getGeneralName());
        periodical.setProductCategory(filterTerm.getProductCategory());
        periodical.setAuthor(author);
        periodical.setJournal(journal);
        periodical.setSourceWeb("知网");
        periodical.setCreated(new Date());
        periodicalList.add(periodical);
    }
}

/**
 * Builds the query string sent to CNKI's {@code SearchHandler.ashx}.
 *
 * <p>The query covers the publish-date window from 60 days ago until today and searches the
 * term's trade name (falling back to the general name when the trade name is blank) in three
 * OR-combined fields: abstract ({@code AB}) or subject ({@code SU}) depending on the corp,
 * title ({@code TI}) and keyword ({@code KY}). When both names are present the general name
 * is added as the second value of every field.
 *
 * @param filterTerm search term; its corp, trade name and general name are read
 * @return the query string, without a leading {@code ?}
 * @throws UnsupportedEncodingException declared by {@link URLEncoder#encode(String, String)}
 */
private String putTogether(TrFilterTerm filterTerm) throws UnsupportedEncodingException {
    Date now = new Date();

    Calendar end = Calendar.getInstance();
    String endYear = String.valueOf(end.get(Calendar.YEAR));
    // Calendar.MONTH is zero-based, hence the +1.
    String endMonth = String.valueOf(end.get(Calendar.MONTH) + 1);
    String endDay = String.valueOf(end.get(Calendar.DAY_OF_MONTH));

    Calendar begin = Calendar.getInstance();
    begin.add(Calendar.DATE, -60);
    String beginYear = String.valueOf(begin.get(Calendar.YEAR));
    // Same zero-based correction as for the end month; without it the window started one
    // month early (and at month "0" in January).
    String beginMonth = String.valueOf(begin.get(Calendar.MONTH) + 1);
    String beginDay = String.valueOf(begin.get(Calendar.DAY_OF_MONTH));
    // Original author's note: "needed when the window crosses a year boundary".
    // NOTE(review): the start is pinned to January 17 of the current year; before that day
    // the start date lies after the end date. Kept as-is - confirm the intended start.
    if (!beginYear.equals(endYear)) {
        beginYear = endYear;
        beginMonth = "01";
        beginDay = "17";
    }

    // Timestamp for the "__" parameter; HH is the 24-hour clock (hh would wrap at 12 with no
    // AM/PM marker in the pattern).
    SimpleDateFormat format = new SimpleDateFormat("EEE MMM dd yyyy HH:mm:ss", Locale.ENGLISH);
    String nowString = format.format(now)+" GMT+0800 (中国标准时间)";

    String tradeName = filterTerm.getTradeName();
    String generalName = filterTerm.getGeneralName();
    boolean hasTradeName = StringUtils.isNotBlank(tradeName);
    // First value: trade name, or the general name when no trade name is set.
    String value1 = URLEncoder.encode(hasTradeName ? tradeName : generalName, "UTF-8");
    // Second value: the general name, only when both names are set.
    String value2 = hasTradeName && StringUtils.isNotBlank(generalName)
            ? URLEncoder.encode(generalName, "UTF-8")
            : null;

    String corp = filterTerm.getCorp();
    // A missing corp is treated like any non-matching corp (subject search).
    String firstSel = corp != null && corp.contains("XXX医药")
            ? "AB%24%25%3D%7C"
            : "SU%24%25%3D%7C";

    StringBuilder request = new StringBuilder("action=&NaviCode=*&ua=1.21&isinEn=1&PageName=ASP.brief_result_aspx&DbPrefix=CFLS&DbCatalog=%e4%b8%ad%e5%9b%bd%e5%ad%a6%e6%9c%af%e6%96%87%e7%8c%ae%e7%bd%91%e7%bb%9c%e5%87%ba%e7%89%88%e6%80%bb%e5%ba%93&ConfigFile=SCDB.xml&db_opt=CJFQ%2CCDFD%2CCMFD%2CCPFD%2CIPFD%2CCCND%2CCCJD");
    request.append("&publishdate_from=")
            .append(beginYear).append("-").append(beginMonth).append("-").append(beginDay);
    request.append("&publishdate_to=")
            .append(endYear).append("-").append(endMonth).append("-").append(endDay);

    appendSearchCondition(request, 1, firstSel, null, value1, value2);
    appendSearchCondition(request, 2, "TI", "or", value1, value2);
    appendSearchCondition(request, 3, "KY", "or", value1, value2);

    request.append("&his=0&__=");
    // Explicit UTF-8, consistent with the other parameters (the one-argument overload uses
    // the platform default charset).
    request.append(URLEncoder.encode(nowString, "UTF-8"));
    return request.toString();
}

/**
 * Appends one {@code txt_<index>_*} search condition group to the query string.
 *
 * @param request query string under construction
 * @param index   1-based position of the condition
 * @param sel     value of {@code txt_<index>_sel}, already URL-encoded
 * @param logical value of {@code txt_<index>_logical}, or {@code null} to omit the parameter
 * @param value1  URL-encoded first search value
 * @param value2  URL-encoded second search value, or {@code null} to omit the parameter
 */
private static void appendSearchCondition(StringBuilder request, int index, String sel,
        String logical, String value1, String value2) {
    String prefix = "&txt_" + index + "_";
    request.append(prefix).append("sel=").append(sel);
    request.append(prefix).append("value1=").append(value1);
    if (value2 != null) {
        request.append(prefix).append("value2=").append(value2);
    }
    if (logical != null) {
        request.append(prefix).append("logical=").append(logical);
    }
    request.append(prefix).append("relation=%23CNKI_OR");
    request.append(prefix).append("special1=%3D");
}

注意:本文归作者所有,未经作者允许,不得转载

全部评论: 0

    我有话说: