| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226 |
- import com.yinjie.heating.common.api.BusinessExecutor
- import com.yinjie.heating.common.datas.ERPModule
- import com.yinjie.heating.common.entity.base.ProcessStringItem
- import com.sweetfish.service.RetResult
- import org.apache.commons.lang3.StringUtils
- import org.apache.logging.log4j.LogManager
- import org.apache.logging.log4j.Logger
- import org.jsoup.Jsoup
- import org.jsoup.nodes.Document
- import org.jsoup.nodes.Element
- import org.jsoup.select.Elements
- import org.jsoup.select.Evaluator
- import javax.annotation.Resource
- import java.util.regex.Pattern
/**
 * Created by jlutt on 2022-07-11
 * <p>
 * Administrative-division updater: scrapes the National Bureau of Statistics
 * (stats.gov.cn) 2022 division-code pages for provinces, cities and counties and writes
 * each level to a JSON file under {@code <APP_HOME>/conf/addressdata}.
 * <p>
 * 2023-08-29: switched to the National Bureau of Statistics as the data source because
 * the Ministry of Civil Affairs data is missing entries.
 *
 * @author jlutt
 */
@SuppressWarnings(["HttpUrlsUsage", 'unused'])
class BE_MapAddressUpdate2 implements BusinessExecutor<ProcessStringItem, ProcessStringItem> {

    /** Root of the 2022 division-code pages; every page URL is built relative to it. */
    private static final String BASE_URL = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/"

    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188"

    /** Connect/read timeout passed to Jsoup, in milliseconds. */
    private static final int REQUEST_TIMEOUT_MS = 10000

    /** Pause after every page, to stay below the site's request-rate limit. */
    private static final long REQUEST_INTERVAL_MS = 2 * 1000L

    /** Back-off after a failed request (e.g. read timeout, 502) before moving on. */
    private static final long ERROR_BACKOFF_MS = 30 * 60 * 1000L

    /** href shape of a county link on a city page: {@code <dir>/<code>.html}. */
    private static final Pattern COUNTY_HREF = Pattern.compile("\\d+/\\d+\\.html")

    protected final Logger logger = LogManager.getLogger(this.getClass().getSimpleName())

    /** Application home directory (injected); the JSON files go to {@code conf/addressdata} below it. */
    @Resource(name = "APP_HOME")
    protected String appHome

    @Override
    String scriptName() {
        return "行政区划更新V2"
    }

    @Override
    ERPModule module() {
        return ERPModule.ADDRESSPARSER
    }

    // Scrape results, rebuilt on every execute() call. Each entry is a map with the keys
    // code, fullCode, name and parentCode (province entries also carry parentName).
    def provinceList = []
    def cityList = []
    // Holds the county-level entries (it is what gets written to counties.json); the
    // property name is kept as-is so existing references keep working.
    def countryList = []

    /**
     * Re-scrapes all three levels and rewrites provices.json, cities.json and counties.json.
     * <p>
     * The files are left untouched when the run was interrupted or when no province could
     * be fetched, so a failed scrape does not replace existing data with empty arrays.
     * NOTE(review): a success result is returned even when the update failed, as before —
     * confirm whether callers need a failure result here.
     *
     * @param source ignored
     * @return a success result carrying an empty item value
     */
    @Override
    RetResult<ProcessStringItem> execute(ProcessStringItem source) {
        provinceList.clear()
        cityList.clear()
        countryList.clear()
        try {
            getProvinces()
            for (province in provinceList) {
                if (Thread.currentThread().isInterrupted()) {
                    break
                }
                getCities(province["code"] as String)
            }
            for (city in cityList) {
                if (Thread.currentThread().isInterrupted()) {
                    break
                }
                getCounties(city["parentCode"] as String, city["code"] as String)
            }

            if (Thread.currentThread().isInterrupted()) {
                logger.error(scriptName() + ": interrupted, data files left untouched")
            } else if (provinceList.isEmpty()) {
                logger.error(scriptName() + ": no province was fetched, data files left untouched")
            } else {
                File dataDir = new File(appHome + File.separator + "conf" + File.separator + "addressdata")
                // 'provices.json' is spelled this way on purpose: it is the existing file name.
                writeJsonFile(dataDir, 'provices.json', provinceList)
                writeJsonFile(dataDir, 'cities.json', cityList)
                writeJsonFile(dataDir, 'counties.json', countryList)
            }
        } catch (IOException e) {
            // A failed county page or file write ends up here and aborts the run.
            logger.error(scriptName() + ",执行出错:" + e, e)
        }
        return RetResult.<ProcessStringItem> successT().result(ProcessStringItem.newBuilder().itemValue("").build())
    }

    /** Writes the entries as a JSON array of {code, name} objects, UTF-8 encoded. */
    private static void writeJsonFile(File directory, String fileName, List entries) {
        List<String> lines = toJsonLines(entries)
        new File(directory, fileName).withWriter('utf-8') { writer ->
            lines.each { line -> writer.writeLine(line) }
        }
    }

    /** Renders the entries line by line; "code" is taken from the entry's fullCode. */
    private static List<String> toJsonLines(List entries) {
        List<String> lines = new ArrayList<String>()
        lines.add('[')
        int last = entries.size() - 1
        for (int i = 0; i <= last; i++) {
            def entry = entries[i]
            lines.add(' {')
            lines.add(' "code": "' + escapeJson(entry["fullCode"]) + '",')
            lines.add(' "name": "' + escapeJson(entry["name"]) + '"')
            lines.add(' }' + (i != last ? ',' : ""))
        }
        lines.add(']')
        return lines
    }

    /** Escapes backslashes and double quotes so the value cannot break the surrounding JSON string. */
    private static String escapeJson(Object value) {
        return ("" + value).replace("\\", "\\\\").replace("\"", "\\\"")
    }

    /** Downloads and parses one page with the shared request settings. */
    private static Document fetchDocument(String url) {
        return Jsoup.connect(url)
                .header("Accept", "*/*")
                .userAgent(USER_AGENT)
                .maxBodySize(0)
                .followRedirects(true)
                .timeout(REQUEST_TIMEOUT_MS)
                .get()
    }

    /**
     * Sleeps for the given time. An interrupt ends the wait early and is recorded on the
     * thread again so that execute() can stop the run instead of losing the signal.
     */
    private void pause(long millis) {
        try {
            Thread.sleep(millis)
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt()
            logger.error("interrupted while waiting between requests: " + ex)
        }
    }

    /**
     * Fetches a page and returns the elements matching the evaluator.
     *
     * @return the matching elements, or null when the request failed; in that case the
     *         error is logged and the thread backs off before returning
     */
    private Elements getElements(String url, Evaluator evaluator) {
        Elements elements
        try {
            elements = fetchDocument(url).select(evaluator)
        } catch (Exception e) {
            logger.error(url + ",执行出错:" + e)
            // After a failed request (e.g. read timeout, 502) wait a good while, to avoid
            // the IP being throttled for requesting too frequently.
            pause(ERROR_BACKOFF_MS)
            return null
        }
        // Requests in quick succession make the site answer with 502 or redirect loops,
        // so pause a little before the next one. This wait is outside the try block: an
        // interrupt here must not be mistaken for a failed request.
        pause(REQUEST_INTERVAL_MS)
        return elements
    }

    /**
     * Builds entry maps from the links of a listing page whose href fully matches the pattern.
     * Links whose text is purely numeric are skipped — presumably the code column, which
     * links to the same page as the name column (confirm against the page markup).
     */
    private static List<Map> parseNamedLinks(Elements links, Pattern hrefPattern, String parentCode) {
        List<Map> entries = new ArrayList<Map>()
        if (links == null) {
            return entries
        }
        for (Element link : links) {
            String href = link.attributes().get("href")
            String text = link.text()
            if (!hrefPattern.matcher(href).matches() || text.matches("\\d+")) {
                continue
            }
            // href is "<dir>/<code>.html": keep only the code part.
            String code = StringUtils.removeEnd(StringUtils.substringAfterLast(href, "/"), ".html")
            entries.add([
                    code      : code,
                    fullCode  : StringUtils.rightPad(code, 6, "0"),
                    name      : text,
                    parentCode: parentCode
            ])
        }
        return entries
    }

    /** Reads the index page and appends one entry per province link to provinceList. */
    def getProvinces() {
        Elements links = getElements(BASE_URL + "index.html",
                new Evaluator.AttributeWithValueEnding("href", ".html"))
        if (links == null) {
            return
        }
        for (Element link : links) {
            String href = link.attributes().get("href")
            if (!href.matches("\\d+\\.html")) {
                continue
            }
            String code = href.replaceAll("\\.html", "")
            provinceList << [
                    code      : code,
                    fullCode  : StringUtils.rightPad(code, 6, "0"),
                    name      : link.text(),
                    parentCode: "",
                    parentName: ""
            ]
        }
    }

    /**
     * Reads a province page and appends its cities to cityList.
     * A failed request is logged by getElements and yields no entries.
     *
     * @param provinceCode the province's short code as found in the index page link
     */
    def getCities(String provinceCode) {
        Pattern pattern = Pattern.compile(provinceCode + "/\\d+\\.html")
        Elements links = getElements(BASE_URL + provinceCode + ".html",
                new Evaluator.AttributeWithValueMatching("href", pattern))
        cityList.addAll(parseNamedLinks(links, pattern, provinceCode))
    }

    /**
     * Reads a city page and appends its counties to countryList.
     * NOTE(review): unlike province/city pages, a failed request here is not swallowed —
     * the IOException propagates and execute() aborts the run without writing any file.
     *
     * @param provinceCode short code of the province the city belongs to
     * @param cityCode     short code of the city, as found in the province page link
     */
    def getCounties(String provinceCode, String cityCode) {
        Document document = fetchDocument(BASE_URL + provinceCode + "/" + cityCode + ".html")

        // The "市辖区" row is looked up by its cell text rather than by link; its code is
        // the first six characters of the row's first cell.
        Elements districtCells = document.select(("tr.countytr td:contains(市辖区)"))
        if (!districtCells.isEmpty()) {
            Element cell = districtCells.first()
            String countyCode = StringUtils.left(cell.parent().select("td").get(0).text(), 6)
            countryList << [
                    code      : countyCode,
                    fullCode  : countyCode,
                    name      : "市辖区",
                    parentCode: cityCode
            ]
        }

        Elements links = document.select(new Evaluator.AttributeWithValueMatching("href", COUNTY_HREF))
        countryList.addAll(parseNamedLinks(links, COUNTY_HREF, cityCode))

        pause(REQUEST_INTERVAL_MS)
    }
}
|