import com.dderp.common.api.BusinessExecutor
import com.dderp.common.datas.ERPModule
import com.dderp.common.entity.base.ProcessStringItem
import com.sweetfish.service.RetResult
import org.apache.commons.lang3.StringUtils
import org.apache.logging.log4j.LogManager
import org.apache.logging.log4j.Logger
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
import org.jsoup.select.Evaluator

import javax.annotation.Resource
import java.util.regex.Pattern

/**
 * Created by jlutt on 2022-07-11
 * Administrative-division updater.
 * Crawls the National Bureau of Statistics division-code pages (provinces, cities, counties)
 * and writes one JSON file per level under {@code <APP_HOME>/conf/addressdata}.
 *
 * NOTE(review): the crawl results are kept in mutable instance lists, so concurrent calls to
 * {@link #execute} on the same instance would presumably interfere with each other - confirm
 * how the host invokes this executor.
 *
 * @author jlutt
 */
@SuppressWarnings(["HttpUrlsUsage", 'unused'])
class BE_MapAddressUpdate2 implements BusinessExecutor {

    /** Root of the 2022 division-code pages; every page URL is built relative to it. */
    private static final String BASE_URL = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/"

    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188"

    /** Timeout handed to Jsoup for each request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000

    /** Pause after every successful request, in milliseconds. */
    private static final long REQUEST_INTERVAL_MILLIS = 2 * 1000

    /** Pause after a failed request, in milliseconds (30 minutes). */
    private static final long ERROR_BACKOFF_MILLIS = 30 * 60 * 1000

    protected final Logger logger = LogManager.getLogger(this.getClass().getSimpleName())

    // 2023-08-29: switched to National Bureau of Statistics data; the civil affairs data is missing entries
    @Resource(name = "APP_HOME")
    protected String appHome

    @Override
    String scriptName() {
        return "行政区划更新V2"
    }

    @Override
    ERPModule module() {
        return ERPModule.ADDRESSPARSER
    }

    // Crawl results. Each entry is a map with the keys code, fullCode, name and parentCode.
    def provinceList = []
    def cityList = []
    def countryList = []

    /**
     * Re-crawls all three levels and rewrites the three JSON files.
     *
     * An {@link IOException} (a failed county request or a file that cannot be written) or an
     * interrupt stops the run and is logged; files not yet written are left untouched.
     *
     * NOTE(review): the success result is returned even when the run failed, exactly as before -
     * confirm whether callers need a failure result instead.
     *
     * @param source not read by this executor
     * @return a success result carrying an empty item value
     */
    @Override
    RetResult execute(ProcessStringItem source) {
        provinceList.clear()
        cityList.clear()
        countryList.clear()
        try {
            getProvinces()
            provinceList.each { p ->
                getCities(p["code"] as String)
            }
            cityList.each { c ->
                getCounties(c["parentCode"] as String, c["code"] as String)
            }
            // NOTE(review): 'provices.json' looks like a typo for 'provinces.json'; kept as-is
            // because whatever reads the file presumably uses the same name.
            writeJson('provices.json', provinceList)
            writeJson('cities.json', cityList)
            writeJson('counties.json', countryList)
        } catch (IOException e) {
            logger.error(scriptName() + ",执行出错:" + e, e)
        } catch (InterruptedException e) {
            // An interrupt is a request to stop: keep the flag visible to the caller and skip writing.
            Thread.currentThread().interrupt()
            logger.error(scriptName() + ",执行出错:" + e, e)
        }
        return RetResult.successT().result(ProcessStringItem.newBuilder().itemValue("").build())
    }

    /**
     * Writes the entries as a JSON array of {@code {"code": fullCode, "name": name}} objects.
     *
     * @param fileName file name inside {@code <APP_HOME>/conf/addressdata}
     * @param entries  crawl result maps
     */
    private void writeJson(String fileName, List entries) {
        String directory = appHome + File.separator + "conf" + File.separator + "addressdata"
        int lastIndex = entries.size() - 1
        new File(directory, fileName).withWriter('utf-8') { writer ->
            writer.writeLine('[')
            entries.eachWithIndex { entry, idx ->
                writer.writeLine(' {')
                writer.writeLine(' "code": "' + jsonEscape(entry["fullCode"]) + '",')
                writer.writeLine(' "name": "' + jsonEscape(entry["name"]) + '"')
                // no trailing comma after the last object
                writer.writeLine(' }' + (idx != lastIndex ? ',' : ""))
            }
            writer.writeLine(']')
        }
    }

    /**
     * Escapes backslashes and double quotes so the value cannot break out of a JSON string.
     * Other characters are written unchanged.
     */
    private static String jsonEscape(Object value) {
        String text = value == null ? "null" : value.toString()
        return text.replace('\\', '\\\\').replace('"', '\\"')
    }

    /**
     * Downloads and parses one page with the request settings shared by all levels.
     * Failures are not handled here; they propagate to the caller.
     */
    private Document fetchDocument(String url) {
        return Jsoup.connect(url)
                .header("Accept", "*/*")
                .userAgent(USER_AGENT)
                .maxBodySize(0)
                .followRedirects(true)
                .timeout(TIMEOUT_MILLIS)
                .get()
    }

    /**
     * Fetches a page and returns the elements matching the evaluator.
     *
     * Requests that come too quickly make the site answer with errors such as
     * "HTTP error fetching URL. Status=502" or "Too many redirects occurred trying to load URL",
     * so the method pauses a few seconds after each successful request. After a failed request
     * (e.g. read timeout, 502) it waits much longer, to avoid the same IP being restricted for
     * requesting too often, and then gives up on this page.
     *
     * Both pauses sit outside the try block, so an interrupt is no longer mistaken for a fetch
     * failure; the InterruptedException propagates and is handled in {@link #execute}.
     *
     * @return the matching elements, or {@code null} when the request failed
     */
    private Elements getElements(String url, Evaluator evaluator) {
        Elements elements
        try {
            elements = fetchDocument(url).select(evaluator)
        } catch (Exception e) {
            logger.error(url + ",执行出错:" + e)
            Thread.sleep(ERROR_BACKOFF_MILLIS)
            return null
        }
        Thread.sleep(REQUEST_INTERVAL_MILLIS)
        // logger.info(url + "符合条件的元素数:" + elements.size())
        return elements
    }

    /**
     * Reads the index page and appends one entry per province link to {@code provinceList}.
     * Nothing is added when the page could not be fetched.
     */
    def getProvinces() {
        Elements elements = getElements(BASE_URL + "index.html", new Evaluator.AttributeWithValueEnding("href", ".html"))
        if (elements == null) {
            return
        }
        elements.each { element ->
            String href = element.attributes().get("href")
            // province links look like "<digits>.html"
            if (href.matches("\\d+\\.html")) {
                String code = href.replaceAll("\\.html", "")
                def province = [
                        code      : code,
                        fullCode  : StringUtils.rightPad(code, 6, "0"),
                        name      : element.text(),
                        parentCode: "",
                        parentName: ""
                ]
                // logger.info(province.code + "=" + province.fullCode + "=" + province.name)
                provinceList << province
            }
        }
    }

    /**
     * Reads one province page and appends its cities to {@code cityList}.
     * Nothing is added when the page could not be fetched.
     *
     * @param provinceCode province code as it appears in the page URL
     */
    def getCities(String provinceCode) {
        Pattern pattern = Pattern.compile(provinceCode + "/\\d+\\.html")
        Elements elements = getElements(BASE_URL + provinceCode + ".html", new Evaluator.AttributeWithValueMatching("href", pattern))
        if (elements == null) {
            return
        }
        for (Element element : elements) {
            String href = element.attributes().get("href")
            String text = element.text()
            // The evaluator only needs a partial match, so require the whole href to match here.
            if (!pattern.matcher(href).matches()) {
                continue
            }
            // Links whose text is only digits are skipped; the entry comes from the link carrying the name.
            if (text.matches("\\d+")) {
                continue
            }
            String code = href.replaceAll(provinceCode + "/", "").replaceAll("\\.html", "")
            def city = [
                    code      : code,
                    fullCode  : StringUtils.rightPad(code, 6, "0"),
                    name      : element.text(),
                    parentCode: provinceCode
            ]
            // logger.info(city.code + "=" + city.fullCode + "=" + city.name)
            cityList << city
        }
    }

    /**
     * Reads one city page and appends its counties to {@code countryList}.
     *
     * Unlike the province and city levels, a failed request is not absorbed here: the exception
     * propagates and ends the whole run in {@link #execute}.
     *
     * @param provinceCode code of the province the city belongs to
     * @param cityCode     city code as it appears in the page URL
     */
    def getCounties(String provinceCode, String cityCode) {
        Pattern pattern = Pattern.compile("\\d+/\\d+\\.html")
        String url = BASE_URL + provinceCode + "/" + cityCode + ".html"
        Document document = fetchDocument(url)

        // The "市辖区" row is looked up by its cell text rather than by a link;
        // its code is the first six characters of the row's first cell.
        Elements districtCells = document.select(("tr.countytr td:contains(市辖区)"))
        if (!districtCells.isEmpty()) {
            Element districtCell = districtCells.first()
            String countyCode = StringUtils.left(districtCell.parent().select("td").get(0).text(), 6)
            def district = [
                    code      : countyCode,
                    fullCode  : countyCode,
                    name      : "市辖区",
                    parentCode: cityCode
            ]
            countryList << district
        }

        Elements elements = document.select(new Evaluator.AttributeWithValueMatching("href", pattern))
        for (Element element : elements) {
            String href = element.attributes().get("href")
            String text = element.text()
            if (!pattern.matcher(href).matches()) {
                continue
            }
            // Links whose text is only digits are skipped; the entry comes from the link carrying the name.
            if (text.matches("\\d+")) {
                continue
            }
            String code = href.replaceAll("\\d+/", "").replaceAll("\\.html", "")
            def country = [
                    code      : code,
                    fullCode  : StringUtils.rightPad(code, 6, "0"),
                    name      : element.text(),
                    parentCode: cityCode
            ]
            // logger.info(country.code + "=" + country.fullCode + "=" + country.name)
            countryList << country
        }
        // throttle before the next city page is requested
        Thread.sleep(REQUEST_INTERVAL_MILLIS)
    }
}