java多线程爬论文
学习笔记仅供参考
1.xiancheng.class
package com.example.util;import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;import java.io.*;
import java.net.*;import java.util.ArrayList;
import java.util.Random;public class xiancheng implements Runnable {static ArrayList<String> ips = new ArrayList<String>();public ArrayList<LW> lws = new ArrayList<>();static xiancheng xian = new xiancheng();static String name="";int q = 0;public xiancheng(){}public xiancheng(ArrayList<String> ipss, int pages) {ips = ipss;q = pages;}public ArrayList<LW>getlws(){return lws;}public String getip() throws InterruptedException {System.getProperties().setProperty("http.proxuHost","113.140.84.97");System.getProperties().setProperty("http.proxyPort","80");String ip = new pachong().ips1().get(0);System.out.println("IP是:"+ip);return ip;}@Overridepublic void run() {Random x = new Random();String path = "";// String [] ips = {"120.79.64.147","175.148.74.121","139.9.195.202","60.167.132.223"};
// String [] ports = {"8118","1133","8118","808"};
// System.getProperties().setProperty("http.proxyHost", ips[q%5]);
// System.getProperties().setProperty("http.proxyPort", ports[q%5]);
// System.err.println(ips[q%5]+":"+ports[q%5]);try {path = "http://qikan.chaoxing.com/searchjour?sw=" + URLEncoder.encode("西北农林科技大学", "utf-8") + "&stryear=17&nosim=1&size=50&x=0_900&pages=" + String.valueOf(q);} catch (UnsupportedEncodingException e) {e.printStackTrace();}q++;Random r = new Random();String[] ua = {"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586","Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko","Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)","Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)","Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7","Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/60.0"};int k = r.nextInt(14);String ip = null;Document doc = null;try {doc = 
Jsoup.connect(path).timeout(10000).ignoreHttpErrors(true).userAgent(ua[k])
// .cookie("msign_dsr", "1592578548328")
// .cookie("search_uuid", "59d60d20%2d888c%2d4339%2d893c%2d3f2df0b416a9")
// .cookie("UM_distinctid", "172cd13faab18a-0cc175a1ccab1-3c3f5a0c-1fa400-172cd13faac61e")
// .cookie("__dxca", "7959aeaa-2cb3-44b4-ac6d-5a19f2ac1261")
// .cookie("lv", "0")
// .cookie("chaoxinguser", "1")
// .cookie("uname", "")
// .cookie("_uid", "143680086")
// .cookie("uf", "f9866f9a46b70622b0dee5ad98478a8391831cfec1654dd57a4fb73fc6375849f7893fe5a7d754cdff17dba0ec4416549b0594e13f4b452fbdd6b93a431584911471850d8bf7e34c24848f3d3f307228570bdddc4e9885c5")
// .cookie("_d", "1592928430294")
// .cookie("UID", "143680086")
// .cookie("vc", "A5267B905B437CE795E472D324AD65E0")
// .cookie("vc2", "C473E4E0C10364B7C0A31DE46070C411")
// .cookie("vc3", "C9JC%2Ff8LUHJ%2FbURcxo1vggUNgTmarTCVJFGMM3kP7MoczjjCxUPi89UGJjjLU4sQ6WNHQgZOUN1mj6sDYiMWNCcwVw%2BMLRe1D3y5d%2FofpoF2AMh3LToyTKUsC9ykkTfsWnWiDPL32C%2Bziy%2BnJFuaW5KB9%2FlDulddw%2B9thByI%2Bvg%3Daaeb61274732c450c6a62c27f9001cce")
// .cookie("xxtenc", "3fa0ee7c7d4f9d548e4bc1d7336c838c")
// .cookie("duxiu", "userName%5fdsr%2c%3dxbnlkjdx%2c%21userid%5fdsr%2c%3d1406%2c%21char%5fdsr%2c%3d%u78cb%2c%21metaType%2c%3d353%2c%21dsr%5ffrom%2c%3d1%2c%21logo%5fdsr%2c%3dlogo0408%2ejpg%2c%21logosmall%5fdsr%2c%3dsmall0408%2ejpg%2c%21title%5fdsr%2c%3d%u897f%u5317%u519c%u6797%u79d1%u6280%u5927%u5b66%2c%21url%5fdsr%2c%3debook%2c%21compcode%5fdsr%2c%3d1176%2c%21province%5fdsr%2c%3d%u9655%u897f%2c%21readDom%2c%3d0%2c%21isdomain%2c%3d61707%2c%21showcol%2c%3d0%2c%21hu%2c%3d0%2c%21uscol%2c%3d0%2c%21isfirst%2c%3d0%2c%21istest%2c%3d0%2c%21cdb%2c%3d0%2c%21og%2c%3d1%2c%21ogvalue%2c%3d7%2c%21testornot%2c%3d1%2c%21remind%2c%3d0%2c%21datecount%2c%3d3478%2c%21userIPType%2c%3d2%2c%21lt%2c%3d0%2c%21ttt%2c%3dfxlogin%2echaoxing%2c%21enc%5fdsr%2c%3dD5E78CB6EBD2A70090256E4C463A3566").get();} catch (IOException e) {return;}Elements subjects = doc.select("#liebiaoDivId tr");Elements authors = doc.select("#liebiaoDivId tr");Elements downloadnum = doc.select("#liebiaoDivId tr");Elements quotenum = doc.select("#liebiaoDivId tr");String[] pathes = new String[50];String[] quotenums = new String[50];String[] downloadnums = new String[50];String[] periodicals = new String[50];String[] publishtimes = new String[50];String[] funds = new String[50];String[][] keys = new String[50][8];String[] units = new String[50];String[] abstracts = new String[50];for (int i = 0; i < 50; i++) {pathes[i] = "http://qikan.chaoxing.com" + subjects.get(i + 1).child(1).child(0).attr("href").trim();quotenums[i] = quotenum.get(i + 1).child(6).child(0).text();downloadnums[i] = downloadnum.get(i + 1).child(7).child(0).text();periodicals[i] = downloadnum.get(i + 1).child(3).child(0).text();if (downloadnum.get(i + 1).child(5).child(0).text().length() == 8) {publishtimes[i] = downloadnum.get(i + 1).child(5).child(0).text().substring(0, 4) + "-0" + String.valueOf(Integer.parseInt(String.valueOf(downloadnum.get(i + 1).child(5).child(0).text().charAt(6))));} else if (downloadnum.get(i + 1).child(5).child(0).text().length() > 
8 && (Integer.valueOf(downloadnum.get(i + 1).child(5).child(0).text().charAt(6)) <= 9) && (Integer.valueOf(downloadnum.get(i + 1).child(5).child(0).text().charAt(6)) >= 0)) {publishtimes[i] = downloadnum.get(i + 1).child(5).child(0).text().substring(0, 4) + "-" + String.valueOf(Integer.valueOf(downloadnum.get(i + 1).child(5).child(0).text().substring(6, 8)) / 2);} else {publishtimes[i] = "1";}//System.out.println(publishtimes[i]);//System.out.println(periodicals[i]);//System.out.println(downloadnums[i]);//System.out.println(quotenums[i]);//System.out.println(pathes[i]);}for (int i = 0; i < 50; i++) {if (i % 10 == 0) {try {ip = xian.getip();} catch (InterruptedException e) {e.printStackTrace();}String[] r1 = ip.split(":");System.out.println(ip);System.getProperties().setProperty("http.proxyHost", r1[0]);System.getProperties().setProperty("http.proxyPort", r1[1]);System.err.println(r1[0] + ":" + r1[1]);}Document doc1 = null;try {doc1 = Jsoup.connect(pathes[i]).timeout(10000).ignoreHttpErrors(true).userAgent(ua[k])
// .cookie("msign_dsr", "1592578548328")
// .cookie("search_uuid", "59d60d20%2d888c%2d4339%2d893c%2d3f2df0b416a9")
// .cookie("UM_distinctid", "172cd13faab18a-0cc175a1ccab1-3c3f5a0c-1fa400-172cd13faac61e")
// .cookie("__dxca", "7959aeaa-2cb3-44b4-ac6d-5a19f2ac1261")
// .cookie("lv", "0")
// .cookie("chaoxinguser", "1")
// .cookie("uname", "")
// .cookie("_uid", "143680086")
// .cookie("uf", "f9866f9a46b70622b0dee5ad98478a8391831cfec1654dd57a4fb73fc6375849f7893fe5a7d754cdff17dba0ec4416549b0594e13f4b452fbdd6b93a431584911471850d8bf7e34c24848f3d3f307228570bdddc4e9885c5")
// .cookie("_d", "1592928430294")
// .cookie("UID", "143680086")
// .cookie("vc", "A5267B905B437CE795E472D324AD65E0")
// .cookie("vc2", "C473E4E0C10364B7C0A31DE46070C411")
// .cookie("vc3", "C9JC%2Ff8LUHJ%2FbURcxo1vggUNgTmarTCVJFGMM3kP7MoczjjCxUPi89UGJjjLU4sQ6WNHQgZOUN1mj6sDYiMWNCcwVw%2BMLRe1D3y5d%2FofpoF2AMh3LToyTKUsC9ykkTfsWnWiDPL32C%2Bziy%2BnJFuaW5KB9%2FlDulddw%2B9thByI%2Bvg%3Daaeb61274732c450c6a62c27f9001cce")
// .cookie("xxtenc", "3fa0ee7c7d4f9d548e4bc1d7336c838c")
// .cookie("duxiu", "userName%5fdsr%2c%3dxbnlkjdx%2c%21userid%5fdsr%2c%3d1406%2c%21char%5fdsr%2c%3d%u78cb%2c%21metaType%2c%3d353%2c%21dsr%5ffrom%2c%3d1%2c%21logo%5fdsr%2c%3dlogo0408%2ejpg%2c%21logosmall%5fdsr%2c%3dsmall0408%2ejpg%2c%21title%5fdsr%2c%3d%u897f%u5317%u519c%u6797%u79d1%u6280%u5927%u5b66%2c%21url%5fdsr%2c%3debook%2c%21compcode%5fdsr%2c%3d1176%2c%21province%5fdsr%2c%3d%u9655%u897f%2c%21readDom%2c%3d0%2c%21isdomain%2c%3d61707%2c%21showcol%2c%3d0%2c%21hu%2c%3d0%2c%21uscol%2c%3d0%2c%21isfirst%2c%3d0%2c%21istest%2c%3d0%2c%21cdb%2c%3d0%2c%21og%2c%3d1%2c%21ogvalue%2c%3d7%2c%21testornot%2c%3d1%2c%21remind%2c%3d0%2c%21datecount%2c%3d3478%2c%21userIPType%2c%3d2%2c%21lt%2c%3d0%2c%21ttt%2c%3dfxlogin%2echaoxing%2c%21enc%5fdsr%2c%3dD5E78CB6EBD2A70090256E4C463A3566").get();} catch (IOException e) {return;}Elements key = doc1.select(".Fmian1 table tbody tr");System.out.println(i + 1 + ": " + pathes[i]);//【作者机构】//【来 源】//【分 类 号】//【分类导航】//【关 键 词】//【基 金】//【摘 要】//【统计数据】//【全文获取】for (Element e : key) {if (e.child(0).text().equals("【关 键 词】")) {for (int z = 0; z < e.child(1).children().size() && z < 8; z++) {keys[i][z] = e.child(1).children().get(z).text();//System.out.println(keys[i][z]);}}if (e.child(0).text().equals("【基 金】")) {funds[i] = e.child(1).text();//System.out.println(funds[i]);}if (e.child(0).text().equals("【作者机构】")) {if (e.child(1).children().size() == 0) {units[i] = "西北农林科技大学黄土高原土壤侵蚀与旱地农业国家重点实验室";} else {units[i] = e.child(1).child(0).text();}//System.out.println(units[i]);}if (e.child(0).text().equals("【摘 要】")) {abstracts[i] = e.child(1).text();//System.out.println(abstracts[i]);}//System.out.println(e.child(0).text());}LW lw = new LW();lw.setSubject(subjects.get(i + 1).child(1).text());lw.setAuthor(authors.get(i + 1).child(2).text());lw.setType("cssci");if (quotenums[i] != null) lw.setQuote_num(quotenums[i]);else lw.setQuote_num("");if (downloadnums[i] != null) lw.setDownload_num(downloadnums[i]);else 
lw.setDownload_num("");lw.setPeriodical(periodicals[i]);lw.setPublish_time(publishtimes[i]);if (keys[i][0] != null) lw.setKey(keys[i][0]);else lw.setKey("");if (keys[i][1] != null) lw.setKey1(keys[i][1]);else lw.setKey1("");if (keys[i][2] != null) lw.setKey2(keys[i][2]);else lw.setKey2("");if (keys[i][3] != null) lw.setKey3(keys[i][3]);else lw.setKey3("");if (keys[i][4] != null) lw.setKey4(keys[i][4]);else lw.setKey4("");if (keys[i][5] != null) lw.setKey5(keys[i][5]);else lw.setKey5("");if (keys[i][6] != null) lw.setKey6(keys[i][6]);else lw.setKey6("");if (keys[i][7] != null) lw.setKey7(keys[i][7]);else lw.setKey7("");if (funds[i] != null) lw.setFund(funds[i]);else lw.setFund("");if (units[i] != null) lw.setUnit(units[i]);else lw.setUnit("");lw.setAbstract1(abstracts[i]);lws.add(lw);LW.lws.add(lw);
// funds[i] = key.get(5).child(1).text();
// System.out.println(funds[i]);}for (int i = 0; i < 50; i++) {}return;}}
2.pachong.class
package com.example.util;import com.example.mapper.bookmapper;
import com.example.service.bookservice;
import com.example.service.serviceimpl.bookserviceimpl;
import org.apache.xmlbeans.impl.xb.xsdschema.Public;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.context.ApplicationContext;
import org.springframework.stereotype.Component;
import org.springframework.stereotype.Controller;
import org.springframework.stereotype.Service;
import org.springframework.test.context.junit4.SpringRunner;import javax.annotation.PostConstruct;
import javax.annotation.Resource;
import java.io.*;
import java.net.*;
import java.security.PublicKey;
import java.util.ArrayList;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@Service
public class pachong{@Autowiredbookmapper bm;public static void main(String[] args) throws IOException, InterruptedException {}public ArrayList<LW> palunwen(Integer q){String path="";ArrayList<LW>lws = new ArrayList<>();
// String [] ips = {"120.79.64.147","175.148.74.121","139.9.195.202","60.167.132.223"};
// String [] ports = {"8118","1133","8118","808"};
// System.getProperties().setProperty("http.proxyHost", ips[q%5]);
// System.getProperties().setProperty("http.proxyPort", ports[q%5]);
// System.err.println(ips[q%5]+":"+ports[q%5]);try {path = "http://qikan.chaoxing.com/searchjour?sw=" + URLEncoder.encode("西北农林科技大学", "utf-8") + "&stryear=13&nosim=1&size=50&x=0_900&pages="+String.valueOf(q);} catch (UnsupportedEncodingException e) {e.printStackTrace();}Random r = new Random();String[] ua = {"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586","Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko","Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)","Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)","Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7","Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/60.0"};int k = r.nextInt(14);String ip = null;ip = ips1().get(0);Document doc = null;try {doc = 
Jsoup.connect(path).timeout(10000).ignoreHttpErrors(true).userAgent(ua[k])
// .cookie("msign_dsr", "1592578548328")
// .cookie("search_uuid", "59d60d20%2d888c%2d4339%2d893c%2d3f2df0b416a9")
// .cookie("UM_distinctid", "172cd13faab18a-0cc175a1ccab1-3c3f5a0c-1fa400-172cd13faac61e")
// .cookie("__dxca", "7959aeaa-2cb3-44b4-ac6d-5a19f2ac1261")
// .cookie("lv", "0")
// .cookie("chaoxinguser", "1")
// .cookie("uname", "")
// .cookie("_uid", "143680086")
// .cookie("uf", "f9866f9a46b70622b0dee5ad98478a8391831cfec1654dd57a4fb73fc6375849f7893fe5a7d754cdff17dba0ec4416549b0594e13f4b452fbdd6b93a431584911471850d8bf7e34c24848f3d3f307228570bdddc4e9885c5")
// .cookie("_d", "1592928430294")
// .cookie("UID", "143680086")
// .cookie("vc", "A5267B905B437CE795E472D324AD65E0")
// .cookie("vc2", "C473E4E0C10364B7C0A31DE46070C411")
// .cookie("vc3", "C9JC%2Ff8LUHJ%2FbURcxo1vggUNgTmarTCVJFGMM3kP7MoczjjCxUPi89UGJjjLU4sQ6WNHQgZOUN1mj6sDYiMWNCcwVw%2BMLRe1D3y5d%2FofpoF2AMh3LToyTKUsC9ykkTfsWnWiDPL32C%2Bziy%2BnJFuaW5KB9%2FlDulddw%2B9thByI%2Bvg%3Daaeb61274732c450c6a62c27f9001cce")
// .cookie("xxtenc", "3fa0ee7c7d4f9d548e4bc1d7336c838c")
// .cookie("duxiu", "userName%5fdsr%2c%3dxbnlkjdx%2c%21userid%5fdsr%2c%3d1406%2c%21char%5fdsr%2c%3d%u78cb%2c%21metaType%2c%3d353%2c%21dsr%5ffrom%2c%3d1%2c%21logo%5fdsr%2c%3dlogo0408%2ejpg%2c%21logosmall%5fdsr%2c%3dsmall0408%2ejpg%2c%21title%5fdsr%2c%3d%u897f%u5317%u519c%u6797%u79d1%u6280%u5927%u5b66%2c%21url%5fdsr%2c%3debook%2c%21compcode%5fdsr%2c%3d1176%2c%21province%5fdsr%2c%3d%u9655%u897f%2c%21readDom%2c%3d0%2c%21isdomain%2c%3d61707%2c%21showcol%2c%3d0%2c%21hu%2c%3d0%2c%21uscol%2c%3d0%2c%21isfirst%2c%3d0%2c%21istest%2c%3d0%2c%21cdb%2c%3d0%2c%21og%2c%3d1%2c%21ogvalue%2c%3d7%2c%21testornot%2c%3d1%2c%21remind%2c%3d0%2c%21datecount%2c%3d3478%2c%21userIPType%2c%3d2%2c%21lt%2c%3d0%2c%21ttt%2c%3dfxlogin%2echaoxing%2c%21enc%5fdsr%2c%3dD5E78CB6EBD2A70090256E4C463A3566").get();} catch (IOException e) {return lws;}Elements subjects = doc.select("#liebiaoDivId tr");Elements authors = doc.select("#liebiaoDivId tr");Elements downloadnum = doc.select("#liebiaoDivId tr");Elements quotenum = doc.select("#liebiaoDivId tr");String[] pathes = new String[50];String[] quotenums = new String[50];String[] downloadnums = new String[50];String[] periodicals = new String[50];String[] publishtimes = new String[50];String[] funds = new String[50];String[][] keys = new String[50][8];String[] units = new String[50];String[] abstracts = new String[50];for (int i = 0; i < 50; i++) {pathes[i] = "http://qikan.chaoxing.com" + subjects.get(i + 1).child(1).child(0).attr("href").trim();quotenums[i] = quotenum.get(i + 1).child(6).child(0).text();downloadnums[i] = downloadnum.get(i + 1).child(7).child(0).text();periodicals[i] = downloadnum.get(i + 1).child(3).child(0).text();if (downloadnum.get(i + 1).child(5).child(0).text().length() == 8) {publishtimes[i] = downloadnum.get(i + 1).child(5).child(0).text().substring(0, 4) + "-0" + String.valueOf(Integer.parseInt(String.valueOf(downloadnum.get(i + 1).child(5).child(0).text().charAt(6))));} else if (downloadnum.get(i + 
1).child(5).child(0).text().length() > 8&&(Integer.valueOf(downloadnum.get(i + 1).child(5).child(0).text().charAt(6))<=9)&&(Integer.valueOf(downloadnum.get(i + 1).child(5).child(0).text().charAt(6))>=0)) {publishtimes[i] = downloadnum.get(i + 1).child(5).child(0).text().substring(0, 4) + "-" + String.valueOf(Integer.valueOf(downloadnum.get(i + 1).child(5).child(0).text().substring(6, 8)) / 2);}else{publishtimes[i] = "1";}//System.out.println(publishtimes[i]);//System.out.println(periodicals[i]);//System.out.println(downloadnums[i]);//System.out.println(quotenums[i]);//System.out.println(pathes[i]);}int num=0;ArrayList<String> ipa = ips1();for (int i = 0; i < 50; i++) {if(num>195){ipa = ips1();}if(i%10==0){ip = ipa.get(num);num++;String[] r1 = ip.split(":");System.out.println(ip);System.getProperties().setProperty("http.proxyHost", r1[0]);System.getProperties().setProperty("http.proxyPort", r1[1]);System.err.println(r1[0]+":"+r1[1]);}Document doc1 = null;try {doc1 = Jsoup.connect(pathes[i]).timeout(10000).ignoreHttpErrors(true).userAgent(ua[k])
// .cookie("msign_dsr", "1592578548328")
// .cookie("search_uuid", "59d60d20%2d888c%2d4339%2d893c%2d3f2df0b416a9")
// .cookie("UM_distinctid", "172cd13faab18a-0cc175a1ccab1-3c3f5a0c-1fa400-172cd13faac61e")
// .cookie("__dxca", "7959aeaa-2cb3-44b4-ac6d-5a19f2ac1261")
// .cookie("lv", "0")
// .cookie("chaoxinguser", "1")
// .cookie("uname", "")
// .cookie("_uid", "143680086")
// .cookie("uf", "f9866f9a46b70622b0dee5ad98478a8391831cfec1654dd57a4fb73fc6375849f7893fe5a7d754cdff17dba0ec4416549b0594e13f4b452fbdd6b93a431584911471850d8bf7e34c24848f3d3f307228570bdddc4e9885c5")
// .cookie("_d", "1592928430294")
// .cookie("UID", "143680086")
// .cookie("vc", "A5267B905B437CE795E472D324AD65E0")
// .cookie("vc2", "C473E4E0C10364B7C0A31DE46070C411")
// .cookie("vc3", "C9JC%2Ff8LUHJ%2FbURcxo1vggUNgTmarTCVJFGMM3kP7MoczjjCxUPi89UGJjjLU4sQ6WNHQgZOUN1mj6sDYiMWNCcwVw%2BMLRe1D3y5d%2FofpoF2AMh3LToyTKUsC9ykkTfsWnWiDPL32C%2Bziy%2BnJFuaW5KB9%2FlDulddw%2B9thByI%2Bvg%3Daaeb61274732c450c6a62c27f9001cce")
// .cookie("xxtenc", "3fa0ee7c7d4f9d548e4bc1d7336c838c")
// .cookie("duxiu", "userName%5fdsr%2c%3dxbnlkjdx%2c%21userid%5fdsr%2c%3d1406%2c%21char%5fdsr%2c%3d%u78cb%2c%21metaType%2c%3d353%2c%21dsr%5ffrom%2c%3d1%2c%21logo%5fdsr%2c%3dlogo0408%2ejpg%2c%21logosmall%5fdsr%2c%3dsmall0408%2ejpg%2c%21title%5fdsr%2c%3d%u897f%u5317%u519c%u6797%u79d1%u6280%u5927%u5b66%2c%21url%5fdsr%2c%3debook%2c%21compcode%5fdsr%2c%3d1176%2c%21province%5fdsr%2c%3d%u9655%u897f%2c%21readDom%2c%3d0%2c%21isdomain%2c%3d61707%2c%21showcol%2c%3d0%2c%21hu%2c%3d0%2c%21uscol%2c%3d0%2c%21isfirst%2c%3d0%2c%21istest%2c%3d0%2c%21cdb%2c%3d0%2c%21og%2c%3d1%2c%21ogvalue%2c%3d7%2c%21testornot%2c%3d1%2c%21remind%2c%3d0%2c%21datecount%2c%3d3478%2c%21userIPType%2c%3d2%2c%21lt%2c%3d0%2c%21ttt%2c%3dfxlogin%2echaoxing%2c%21enc%5fdsr%2c%3dD5E78CB6EBD2A70090256E4C463A3566").get();} catch (IOException e) {return lws;}Elements key = doc1.select(".Fmian1 table tbody tr");System.out.println(i+1+": "+pathes[i]);//【作者机构】//【来 源】//【分 类 号】//【分类导航】//【关 键 词】//【基 金】//【摘 要】//【统计数据】//【全文获取】for(Element e:key){if(e.child(0).text().equals("【关 键 词】")){for (int z = 0; z < e.child(1).children().size() && z < 8; z++) {keys[i][z] = e.child(1).children().get(z).text();//System.out.println(keys[i][z]);}}if(e.child(0).text().equals("【基 金】")){funds[i] = e.child(1).text();//System.out.println(funds[i]);}if(e.child(0).text().equals("【作者机构】")){if(e.child(1).children().size()==0){units[i] = "西北农林科技大学黄土高原土壤侵蚀与旱地农业国家重点实验室";}else{units[i] = e.child(1).child(0).text();}//System.out.println(units[i]);}if(e.child(0).text().equals("【摘 要】")){abstracts[i] = e.child(1).text();//System.out.println(abstracts[i]);}//System.out.println(e.child(0).text());}
// funds[i] = key.get(5).child(1).text();
// System.out.println(funds[i]);}for (int i = 0; i < 50; i++) {LW lw = new LW();lw.setSubject(subjects.get(i + 1).child(1).text());lw.setAuthor(authors.get(i + 1).child(2).text());lw.setType("cssci");if (quotenums[i] != null) lw.setQuote_num(quotenums[i]);else lw.setQuote_num("");if (downloadnums[i] != null) lw.setDownload_num(downloadnums[i]);else lw.setDownload_num("");lw.setPeriodical(periodicals[i]);lw.setPublish_time(publishtimes[i]);if (keys[i][0] != null) lw.setKey(keys[i][0]);else lw.setKey("");if (keys[i][1] != null) lw.setKey1(keys[i][1]);else lw.setKey1("");if (keys[i][2] != null) lw.setKey2(keys[i][2]);else lw.setKey2("");if (keys[i][3] != null) lw.setKey3(keys[i][3]);else lw.setKey3("");if (keys[i][4] != null) lw.setKey4(keys[i][4]);else lw.setKey4("");if (keys[i][5] != null) lw.setKey5(keys[i][5]);else lw.setKey5("");if (keys[i][6] != null) lw.setKey6(keys[i][6]);else lw.setKey6("");if (keys[i][7] != null) lw.setKey7(keys[i][7]);else lw.setKey7("");if (funds[i] != null) lw.setFund(funds[i]);else lw.setFund("");if (units[i] != null) lw.setUnit(units[i]);else lw.setUnit("");lw.setAbstract1(abstracts[i]);lws.add(lw);}return lws;}public ArrayList<String> ips() throws IOException {String path = "http://api.xiequ.cn/VAD/GetIp.aspx?act=get&num=100&time=30&plat=1&re=1&type=0&so=1&ow=1&spl=1&addr=&db=1";// 要获得html页面内容的地址URL url = new URL(path);// 创建url对象HttpURLConnection conn = (HttpURLConnection) url.openConnection();// 打开连接conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");conn.setRequestProperty("contentType", "GBK"); // 设置url中文参数编码conn.setConnectTimeout(5 * 1000);// 请求的时间conn.setRequestMethod("GET");// 请求方式InputStream inStream = conn.getInputStream();// readLesoSysXML(inStream);BufferedReader in = new BufferedReader(new InputStreamReader(inStream, "GBK"));StringBuffer buffer = new StringBuffer();ArrayList<String> ipp = new ArrayList<String>();String line = "";// 读取获取到内容的最后一行,写入while ((line = in.readLine()) != 
null) {buffer.append(line);ipp.add(line);System.out.println(line);}String str = buffer.toString();
// JSONObject json1 = JSONObject.parseObject(str);
// JSONArray jsons = JSONArray.parseArray(json1.get("data").toString());// for(Object json:jsons){// JSONObject ips = JSONObject.parseObject(json.toString());
// String ip = ips.get("IP").toString();
// System.out.println(ip);
// ipp.add(ip);
// }return ipp;}public ArrayList<String> ips1(){String path = "http://119.45.8.232/Api/?k=NRAX9VIN451I35QJP36SEM&num=1&type=1&f=1&repeat=1&respone=0&ptn=1";// 要获得html页面内容的地址URL url = null;// 创建url对象try {url = new URL(path);} catch (MalformedURLException e) {ArrayList<String>s = new ArrayList<>();s.add("1.1.1.1:80");return s;}HttpURLConnection conn = null;// 打开连接try {conn = (HttpURLConnection) url.openConnection();} catch (IOException e) {ArrayList<String>s = new ArrayList<>();s.add("58.218.200.248:22069");return s;}conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");conn.setRequestProperty("contentType", "utf-8"); // 设置url中文参数编码conn.setConnectTimeout(5 * 1000);// 请求的时间try {conn.setRequestMethod("GET");// 请求方式} catch (ProtocolException e) {ArrayList<String>s = new ArrayList<>();s.add("1.1.1.1:80");return s;}InputStream inStream = null;try {inStream = conn.getInputStream();} catch (IOException e) {ArrayList<String>s = new ArrayList<>();s.add("1.1.1.1:80");return s;}// readLesoSysXML(inStream);BufferedReader in = null;try {in = new BufferedReader(new InputStreamReader(inStream, "utf-8"));} catch (UnsupportedEncodingException e) {ArrayList<String>s = new ArrayList<>();s.add("1.1.1.1:80");return s;}StringBuffer buffer = new StringBuffer();ArrayList<String> ipp = new ArrayList<String>();String line = "";// 读取获取到内容的最后一行,写入while (true) {try {if (!((line = in.readLine()) != null)) break;} catch (IOException e) {ArrayList<String>s = new ArrayList<>();s.add("1.1.1.1:80");return s;}buffer.append(line);ipp.add(line);System.out.println(line);}String str = buffer.toString();
// JSONObject json1 = JSONObject.parseObject(str);
// JSONArray jsons = JSONArray.parseArray(json1.get("data").toString());// for(Object json:jsons){// JSONObject ips = JSONObject.parseObject(json.toString());
// String ip = ips.get("IP").toString();
// System.out.println(ip);
// ipp.add(ip);
// }return ipp;}public void xiancheng() throws IOException, InterruptedException {ExecutorService pool = Executors.newCachedThreadPool();// ArrayList<String>ips = new ArrayList<>();
// try {// File file = new File("C:/IMAGES/ip.txt");
// BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
// String strLine = null;
// int lineCount = 1;
// while(null != (strLine = bufferedReader.readLine())){// System.err.println(strLine);
// ips.add(strLine);
// lineCount++;
// }
// }catch(Exception e){// e.printStackTrace();
// }ArrayList ips = ips();for (int i = 0; i < 1500; i++) {xiancheng xian = new xiancheng(ips, i);xian.run();//pool.execute(xian);for(LW lw:xian.lws) {if (!bm.searchlw(lw.getSubject())) {if (bm.lwcount() == null) {lw.setId(String.valueOf("1"));} else {lw.setId(String.valueOf(Integer.valueOf(bm.lwcount()) + 1));}}bm.insertlw(lw);System.out.println(lw.toString());}}}
}
java多线程爬论文相关推荐
- Java多线程爬取豆瓣排行榜Top250(maven)
总体设计 分析网页 确定需要用到的jar包,并通过pom.xml进行配置 创建一个电影实体类(Film),写入要爬取电影的相关属性,并用set和get方法封装 编写一个多线程爬取Top250的电影程序 ...
- java 爬取图片_使用Java多线程爬取网站图片
使用Java爬取网站的图片并保存至本地 使用maven导入依赖org.jsoup jsoup 1.11.2 实现代码:import org.jsoup.HttpStatusException; imp ...
- java多线程-爬电影天堂上的电影下载地址
获取网页html 刚开始做的时候,在网上搜了一下资料.然后找到了一个获取网页最简单的demo,如下. public static String getHtml(String urlstring) th ...
- java多线程爬取养眼福利图片(懂得都懂,带注释)
上次做了个python版的福利爬虫,不过咱主业还是java,所以再发个java版本的,附带每一步的注释,有看不懂的地方欢迎提问。首先导入jar包,我这里用的maven,没有maven的可以自己在网上下载 ...
- java 多线程爬取网页,利用jsoup爬取百度网盘资源分享连接(多线程)
突然有一天就想说能不能用某种方法把百度网盘上分享的资源连接抓取下来,于是就动手了.知乎上有人说过最好的方法就是http://pan.baidu.com/wap抓取,一看果然链接后面的uk值是一串数字, ...
- java多线程爬图书图片
学习笔记仅供参考 1.xiancheng.class package com.example.util;import com.example.service.bookservice; import o ...
- java多线程做一件事_关于Java的十件事
java多线程做一件事 那么,您从一开始就一直在使用Java? 还记得那些被称为" Oak"的日子,OO仍然是热门话题,C ++人士认为Java没有机会,Applet还是一件事吗? ...
- Java多线程网络爬虫(时光网为例)
目录 多线程简介 多线程网络爬虫 分析要爬的数据 网络抓包 爬虫框架 model MtimeThread主方法 MtimeParse解析数据 数据库操作 多线程简介 Java多线程实现方式主要有三种: ...
- java程序设计 论文_《JAVA程序设计》论文要求
《JAVA程序设计》论文要求 1. 文字要求:2000字到3000字之间 2. 打印稿: 1) 纸张大小:A4 2) 题目:黑体,2号字 3) 班级 姓名 学号 EMAIL(题目下方):宋 ...
- java 多线程重温
之前犯懒一直没有开过博客,然后一直学一直丢一直在重新学...重新学习的过程中找资源是很艰难的,所以今天开通了博客,希望能每天进步一点点,每天更快乐一点点. ---------------------- ...
最新文章
- Hash算法解决冲突的四种方法
- 阿里 P7 内功修炼法则
- python基础语法总结-Python基础语法总结之逻辑运算
- VS_自动添加头文件
- FPGA 实验六 计数器、 ROM和DDS
- 【H2 Database】导出CSV
- leetcode1247. 交换字符使得字符串相同(贪心)
- 单片机Proteus7.8仿真和Proteus8.6仿真 LED点阵 温度采集 电子琴 温度报警 电子秤 音乐播放器 PWM 电压表 温度计 交通灯
- golang配置环境排坑
- android studio 修改文件后出现类型转换错误
- Android系统音量范围0-100,步进值5(十三)
- jq+layui之保存修改的表状态
- systemback Linux 系统备份、迁移
- 闽什么什么院第二课堂网课破解-----微信内置浏览器
- 人月神话札记:未雨绸缪
- java过滤器定义_Java ---Filter过滤器
- 关于IE7半透明背景问题
- 永中java的窗口_永中國際 Office
- linux系统安装(Ubuntu 16.04.5)
- 上海产品运营招聘面试技巧分析
热门文章
- 2022-2027年中国康复机器人行业市场全景评估及发展战略规划报告
- Flink JVM 内存超限的分析方法总结
- CSU 1684-Disastrous Downtime(set+二分)
- 给学计算机的人的肺腑之言
- 智能微型断路器;智慧用电在线监控装置;故障电弧探测器在金融行业中的应用-安科瑞黄安南
- 学校头歌作业2_2计算弓形的表面积与体积(头歌作业[Python])
- 【CTF基础】RSA在多项式情况下的应用——watevrCTF 2019题目Swedish RSA学习
- Largest Redis Clusters Ever
- CELF(Cost-Effective Lazy Forward selection)具有成本效益的惰性前向选择算法
- haizei c++ 试听课程知识点 day1