提交 c1cfd468 作者: 梁业锦 💬

修复H&M的爬虫BUG

上级 0a6f3bb8
......@@ -4,11 +4,10 @@ import com.diaoyun.zion.chinafrica.bis.IItemSpider;
import com.diaoyun.zion.chinafrica.enums.PlatformEnum;
import com.diaoyun.zion.chinafrica.vo.*;
import com.diaoyun.zion.master.util.HttpClientUtil;
import com.diaoyun.zion.master.util.JsoupUtil;
import com.diaoyun.zion.master.util.SpiderUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
......@@ -18,11 +17,12 @@ import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.math.BigDecimal;
import java.net.URISyntaxException;
import java.util.*;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate;
......@@ -37,13 +37,16 @@ public class HmSpider implements IItemSpider {
/**
* H&M 数据格式化
*
* @param targetUrl
* @return
*/
@Override
public JSONObject captureItem(String targetUrl) throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
String[] spilt = targetUrl.split("productpage.");
targetUrl = "https://www2.hm.com/zh_cn/productpage." + spilt[1];
Pattern pattern = Pattern.compile("\\d+");
Matcher matcher = pattern.matcher(targetUrl);
matcher.find();
targetUrl = "https://www.hm.com.cn/en_cn/" + matcher.group() + ".html";
String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.HM.getValue());
ProductResponse productResponse = formatProductResponse(content);
JSONObject resultObj = JSONObject.fromObject(productResponse);
......@@ -54,23 +57,12 @@ public class HmSpider implements IItemSpider {
/**
* 格式化返回数据
* TODO 存在把不在页面上显示的颜色的尺码也算了进去
*
* @param content 页面数据
* @return 格式化后的数据
*/
public static ProductResponse formatProductResponse(String content) {
// 获取主要数据并将转换 Json 数据及 Document 对象
String detailStr = JsoupUtil.getScriptContent(content, "productArticleDetails");
int firstBrackets = detailStr.indexOf("{");
int lastbrackets = detailStr.lastIndexOf("}");
String resultStr = detailStr.substring(firstBrackets, lastbrackets + 1);
resultStr = resultStr.replaceAll("\'", "\"")
.replaceAll("\"image\": isDesktop [?] ", "")
.replaceAll("\"fullscreen\": isDesktop [?] ", "")
.replaceAll("\"zoom\": isDesktop [?] ", "")
.replaceAll("isDesktop [?] \"//www2.hm.com/\" : ","");
JSONObject dataMap = JSONObject.fromObject(resultStr);
public static ProductResponse formatProductResponse(String content) throws IOException, URISyntaxException {
Document document = Jsoup.parse(content);
// 声明封装类
ProductResponse productResponse = new ProductResponse();
// 含有商品的属性,设置为true
......@@ -92,33 +84,43 @@ public class HmSpider implements IItemSpider {
//////////////////////////////////// 获取商品基本信息 ////////////////////////////
itemInfo.setShopName("H&M");
itemInfo.setShopUrl("https://www2.hm.com/");
itemInfo.setItemId(document.select("div[class=article-code]").select("li").text());
itemInfo.setTitle(document.select("h1[class=primary product-item-headline]").text());
itemInfo.setShopUrl("https://www.hm.com.cn/");
itemInfo.setItemId(document.select("div[class=product-info-main]").select("a").attr("data-product-id"));
itemInfo.setTitle(document.select("meta[name=title]").attr("content"));
//////////////////////////////////// 获取商品基本信息(图片下取)End /////////////////////////
// 获取原始价
String fullPrice = document.select("div[class=primary-row product-item-price]").text();
String fullPrice = document.select("meta[property=product:price:amount]").attr("content");
fullPrice = SpiderUtil.retainNumber(fullPrice);
// TODO 转换汇率,目前商品单位是人民币
fullPrice = exchangeRate(fullPrice);
BigDecimal priceOld = new BigDecimal(fullPrice);
BigDecimal div = new BigDecimal("100");
fullPrice = priceOld.divide(div, 2, BigDecimal.ROUND_DOWN).toString();
//////////////////////////////////// 获取商品颜色属性 ////////////////////////////////////////////
// 取页面的数据
Elements colorEle = document.select("div[class=mini-slider]").select("ul[class=inputlist clearfix]").select("li");
Elements colorEle = document.select("div[id=article-list-owl]").select("div");
List<String> urlList = new ArrayList<>();
for (Element element : colorEle) {
if (!StringUtils.isEmpty(element.attr("data-product-url"))) {
// 获取其他商品的链接
urlList.add(element.attr("data-product-url"));
}
}
for (String url : urlList) {
content = HttpClientUtil.getContentByUrl(url, PlatformEnum.HM.getValue());
document = Jsoup.parse(content);
String colorNo = element.select("a").attr("data-articlecode");
String color = element.select("a").attr("data-color");
String imgUrl = "http:" + element.select("noscript").attr("data-src");
colorEle = document.select("span[class=current-article-title]");
String color = colorEle.attr("data-title");
String imgUrl = document.select("meta[property=og:image]").attr("content");
itemInfo.setPic(imgUrl);
ProductProp productPropColor = new ProductProp();
productPropColor.setPropId(colorNo);
productPropColor.setPropId(color);
productPropColor.setPropName(color);
productPropColor.setImage(imgUrl);
propSet.add(productPropColor);
......@@ -132,16 +134,12 @@ public class HmSpider implements IItemSpider {
//////////////////////////////////// 获取商品颜色属性 END ////////////////////////////////////////////
///////////////////////// 获取商品尺码属性 ///////////////////////////////////////////////////////////
// TODO 这里好像出了点问题。。。
JSONArray sizeArr = dataMap.getJSONObject(colorNo).getJSONArray("sizes");
for (int i = 0; i < sizeArr.size(); i++) {
JSONObject sizeObj = sizeArr.getJSONObject(i);
String size = sizeObj.getString("name");
String sizeNo = sizeObj.getString("sizeCode");
Elements sizeEles = document.select("div[class=picker-content]").select("div[class=size-option ]").select("span[class=variant-size-value]");
for (Element sizeEle : sizeEles) {
String size = sizeEle.text();
ProductProp productPropSize = new ProductProp();
productPropSize.setPropId(sizeNo);
productPropSize.setPropId(size);
productPropSize.setPropName(size);
sizePropSet.add(productPropSize);
if (productPropSet.get("尺码") == null) {
......@@ -154,8 +152,7 @@ public class HmSpider implements IItemSpider {
///////////////////////// 获取商品尺码属性END //////////////////////////////////////////////////////
// 设置 skuStr
String skuStr = ";" + colorNo + ";" + sizeNo + ";";
String skuStr = ";" + color + ";" + size + ";";
//////////////////////////////////// 获取库存 ////////////////////////////////////////////
if (productSkuStockList == null) {
productSkuStockList = new ArrayList<>();
......@@ -176,6 +173,8 @@ public class HmSpider implements IItemSpider {
productResponse.setPrice(fullPrice);
productResponse.setSalePrice(fullPrice + "-" + fullPrice);
//////////////////////////////////// 获取原始价 END//////////////////////////////////
}
}
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论