提交 edd3e9e1 authored 作者: zhengfg's avatar zhengfg

1、增加nike、gap 爬虫

2、对爬虫返回的数据结构进行调整
上级 591f8b9e
package com.diaoyun.zion.chinafrica.bis.impl;
import com.diaoyun.zion.chinafrica.bis.IItemSpider;
import com.diaoyun.zion.chinafrica.enums.PlatformEnum;
import com.diaoyun.zion.chinafrica.vo.ProductResponse;
import com.diaoyun.zion.master.util.*;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.http.message.BasicHeader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException;
/**
 * Item spider for Gap product pages.
 *
 * <p>Extracts the style code from the product link, posts it as a search
 * condition to the Gap product-search endpoint and, when the endpoint reports
 * success, converts the returned {@code data} object into a
 * {@link ProductResponse}.
 */
@Component("gapItemSpider")
public class GapItemSpider implements IItemSpider {
    private static final Logger logger = LoggerFactory.getLogger(GapItemSpider.class);
    // Gap product-detail endpoint (search products by condition)
    private static final String gapUrl="https://apicn.gap.cn/gap/store/product/list/searchProductByCondition.do";

    /**
     * Fetches the product identified by {@code targetUrl} from the Gap API.
     *
     * @param targetUrl product page link; its last path segment must contain the
     *                  style code between its first and last underscore
     * @return the {@link ProductResponse} serialized as JSON when the API response has
     *         {@code success == true}; otherwise the raw API response object
     * @throws IllegalArgumentException if no style code can be extracted from {@code targetUrl}
     */
    @Override
    public JSONObject captureItem(String targetUrl) throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
        // Style code embedded in the product link
        String itemId = getItemId(targetUrl);

        // Single search condition: key "style", basic value type, value = [itemId]
        JSONObject valueObj = new JSONObject();
        valueObj.put("key", "style");
        valueObj.put("valueType", "basic");
        valueObj.put("value", new String[] {itemId});

        JSONArray conditionList = new JSONArray();
        conditionList.add(valueObj);

        JSONObject condition = new JSONObject();
        condition.put("conditionList", conditionList);

        // The condition object is passed to the HTTP helper under the "data" key
        Map<String, Object> paramMap = new HashMap<>();
        paramMap.put("data", condition);

        // Call the endpoint and parse the response
        String content = HttpClientUtil.sendPostWithBodyParameter(gapUrl, paramMap);
        JSONObject resultObj = JSONObject.fromObject(content);
        if (resultObj.getBoolean("success")) {
            // Convert the payload into the common response structure
            ProductResponse productResponse = SpiderUtil.formatGapProductResponse(resultObj.getJSONObject("data"));
            resultObj = JSONObject.fromObject(productResponse);
        }
        return resultObj;
    }

    /**
     * Extracts the style code from a product link: the text between the first and
     * the last underscore of the last path segment.
     *
     * <p>Any query string or fragment is removed first so that underscores or
     * slashes inside them cannot shift the extracted range.
     *
     * @param targetUrl product page link
     * @return the text between the first and last underscore of the last path segment
     * @throws IllegalArgumentException if {@code targetUrl} is null or its last path
     *         segment does not contain two distinct underscores
     */
    private String getItemId(String targetUrl) {
        if (targetUrl == null) {
            throw new IllegalArgumentException("Gap product url must not be null");
        }
        // Cut at the first '?' or '#', whichever comes first
        int end = targetUrl.length();
        int queryStart = targetUrl.indexOf('?');
        if (queryStart >= 0) {
            end = queryStart;
        }
        int fragmentStart = targetUrl.indexOf('#');
        if (fragmentStart >= 0 && fragmentStart < end) {
            end = fragmentStart;
        }
        String path = targetUrl.substring(0, end);

        String spuCode = path.substring(path.lastIndexOf('/') + 1);
        int firstUnder = spuCode.indexOf('_');
        int lastUnder = spuCode.lastIndexOf('_');
        // Fewer than two underscores would make substring() fail with an opaque index error
        if (firstUnder < 0 || firstUnder == lastUnder) {
            throw new IllegalArgumentException("Cannot extract Gap style code from url: " + targetUrl);
        }
        return spuCode.substring(firstUnder + 1, lastUnder);
    }
}
package com.diaoyun.zion.chinafrica.bis.impl;
import com.diaoyun.zion.chinafrica.bis.IItemSpider;
import com.diaoyun.zion.chinafrica.enums.PlatformEnum;
import com.diaoyun.zion.chinafrica.vo.ProductResponse;
import com.diaoyun.zion.master.util.*;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException;
/**
 * Item spider for Nike product pages.
 *
 * <p>Downloads the product page, reads the object assigned to the
 * {@code window.INITIAL_REDUX_STATE} script variable and converts it into a
 * {@link ProductResponse}.
 */
@Component("nikeItemSpider")
public class NikeItemSpider implements IItemSpider {
    private static Logger logger = LoggerFactory.getLogger(NikeItemSpider.class);

    /**
     * Fetches and converts the product found at {@code targetUrl}.
     *
     * @param targetUrl Nike product page link
     * @return the {@link ProductResponse} serialized as JSON
     */
    @Override
    public JSONObject captureItem(String targetUrl) throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
        // Page content behind the link
        String pageHtml = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.NIKE.getValue());
        // Product details live in a <script> tag, in the window.INITIAL_REDUX_STATE variable
        JSONObject reduxState = JsoupUtil.getItemDetailByName(pageHtml, "window.INITIAL_REDUX_STATE");
        // Convert into the common response structure and serialize it
        ProductResponse productResponse = SpiderUtil.formatNikeProductResponse(reduxState);
        return JSONObject.fromObject(productResponse);
    }
}
......@@ -38,8 +38,8 @@ public class TmItemSpider implements IItemSpider {
List<Map<String, Object>> futureList= new ArrayList<>();
//获取url中的网页内容
String content = HttpClientUtil.getContentByUrl(targetUrl,PlatformEnum.TM.getValue());
//获取商品详情
JSONObject infoMap= JsoupUtil.getTmItemDetail(content);
//获取商品详情 观察数据可发现商品数据在 _DATA_Detail 变量中
JSONObject infoMap= JsoupUtil.getItemDetailByName(content,"_DATA_Detail");
JSONObject skuBaseMap= (JSONObject) infoMap.get("skuBase");
if(!(skuBaseMap.get("props") instanceof JSONNull)) {
JSONArray propsArray= (JSONArray) skuBaseMap.get("props");
......
......@@ -24,4 +24,7 @@ public class KeyConstant {
/////////////////订单 END////////////////
//验证码前缀
public final static String CAPTCHA="captcha_";
//自定义id头部
public final static String CUSTOMIZE_ID="customizeId_";
}
......@@ -12,6 +12,8 @@ public enum PlatformEnum implements EnumItemable<PlatformEnum> {
TB("淘宝", "tb"),
TM("天猫", "tm"),
GAP("GAP", "gap"),
NIKE("NIKE", "nike"),
UN("未知", "un");
private String label;
......
......@@ -20,6 +20,14 @@ public class ItemSpiderFactory {
iItemSpider= (IItemSpider) SpringContextUtil.getBean("tmItemSpider");
break;
}
case "gap":{
iItemSpider= (IItemSpider) SpringContextUtil.getBean("gapItemSpider");
break;
}
case "nike":{
iItemSpider= (IItemSpider) SpringContextUtil.getBean("nikeItemSpider");
break;
}
default:{
iItemSpider= (IItemSpider) SpringContextUtil.getBean("emptyItemSpider");
break;
......
......@@ -47,6 +47,10 @@ public class SpiderServiceImpl implements SpiderService {
platformEnum=PlatformEnum.TB;
} else if(targetUrl.contains("tmall.com/item.htm")) {
platformEnum=PlatformEnum.TM;
} else if(targetUrl.contains("www.gap.cn/pdp/")) {
platformEnum=PlatformEnum.GAP;
} else if(targetUrl.contains("www.nike.com/cn/t/")) {
platformEnum=PlatformEnum.NIKE;
}
return platformEnum;
}
......
......@@ -9,5 +9,21 @@ public class DynStock {
//可用总的库存数
private int sellableQuantity;
//sku对应的库存数
private List<ProductSku> sku;
private List<ProductSkuStock> productSkuStockList;
public int getSellableQuantity() {
return sellableQuantity;
}
public void setSellableQuantity(int sellableQuantity) {
this.sellableQuantity = sellableQuantity;
}
public List<ProductSkuStock> getProductSkuStockList() {
return productSkuStockList;
}
public void setProductSkuStockList(List<ProductSkuStock> productSkuStockList) {
this.productSkuStockList = productSkuStockList;
}
}
......@@ -4,7 +4,7 @@ package com.diaoyun.zion.chinafrica.vo;
* 原始价格
*/
public class OriginalPrice {
//sku字符串 ;1627207:425613015;
//sku id标识 ;1627207:425613015;
private String skuStr;
//sku对应价格
private String price;
......
......@@ -4,7 +4,7 @@ package com.diaoyun.zion.chinafrica.vo;
* 商品促销价格
*/
public class ProductPromotion {
//sku字符串 ;1627207:425613015;
//sku id标识 ;1627207:425613015;
private String skuStr;
//sku对应价格
private String price;
......
......@@ -44,4 +44,24 @@ public class ProductProp {
public void setTranslate(String translate) {
this.translate = translate;
}
/**
 * Two props are equal when their {@code propId} values are equal.
 * The comparison is null-safe: props whose {@code propId} is unset compare
 * equal to each other instead of throwing.
 *
 * @param obj the object to compare with
 * @return {@code true} if {@code obj} is a {@code ProductProp} with an equal {@code propId}
 */
@Override
public boolean equals(Object obj) {
    if (this == obj) {
        return true;
    }
    // instanceof is false for null, so no separate null check is needed
    if (!(obj instanceof ProductProp)) {
        return false;
    }
    ProductProp other = (ProductProp) obj;
    return java.util.Objects.equals(other.propId, this.propId);
}

/**
 * Hash code derived from {@code propId} only, consistent with {@link #equals(Object)};
 * an unset {@code propId} hashes to 0.
 */
@Override
public int hashCode() {
    return java.util.Objects.hashCode(propId);
}
}
package com.diaoyun.zion.chinafrica.vo;
import java.util.List;
import java.util.Set;
/**
* 商品属性list
*/
@Deprecated
public class ProductProps {
//属性名 比如颜色
private String name;
//翻译
private String translate;
//商品属性
private List<ProductProp> prop;
private Set<ProductProp> propSet;
public String getName() {
return name;
......@@ -29,11 +30,11 @@ public class ProductProps {
this.translate = translate;
}
public List<ProductProp> getProp() {
return prop;
public Set<ProductProp> getPropSet() {
return propSet;
}
public void setProp(List<ProductProp> prop) {
this.prop = prop;
public void setPropSet(Set<ProductProp> propSet) {
this.propSet = propSet;
}
}
package com.diaoyun.zion.chinafrica.vo;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* 爬取数据后,返回页面的商品详情数据
......@@ -8,36 +10,59 @@ import java.util.List;
public class ProductResponse {
//原始价格 有优惠的话还有优惠价
private List<OriginalPrice> originalPrice;
private List<OriginalPrice> originalPriceList;
//是否包促销价格 true 有促销价格,false\null没有促销价格
private boolean promotionFlag;
//促销价格
private List<ProductPromotion> promotion;
//一口价,就是商品一开始展示的价格,比如多sku的情况下展示 18.80-49.90
private List<ProductPromotion> promotionList;
//原价一口价,就是商品一开始展示的价格,比如多sku多价格的情况下展示 18.80-49.90
private String price;
//促销一口价
private String salePrice;
//是否包含库存信息 有些商品没有库存信息,可以当作是有货 true 有库存信息,false没有
private boolean stockFlag;
//库存
private DynStock dynStock;
//是否包含商品属性,有些商品没有属性
private boolean propFlag;
//商品属性
private List<ProductProps> propList;
//商品属性 颜色:红色,蓝色;尺码:S,l,M
private Map<String, Set<ProductProp>> productPropSet;
//商品信息
private ItemInfo itemInfo;
//商品来源平台 PlatformEnum
private String platform;
public List<OriginalPrice> getOriginalPrice() {
return originalPrice;
public boolean isStockFlag() {
return stockFlag;
}
public void setStockFlag(boolean stockFlag) {
this.stockFlag = stockFlag;
}
public List<OriginalPrice> getOriginalPriceList() {
return originalPriceList;
}
public void setOriginalPriceList(List<OriginalPrice> originalPriceList) {
this.originalPriceList = originalPriceList;
}
public void setOriginalPrice(List<OriginalPrice> originalPrice) {
this.originalPrice = originalPrice;
public List<ProductPromotion> getPromotionList() {
return promotionList;
}
public List<ProductPromotion> getPromotion() {
return promotion;
public String getSalePrice() {
return salePrice;
}
public void setPromotion(List<ProductPromotion> promotion) {
this.promotion = promotion;
public void setSalePrice(String salePrice) {
this.salePrice = salePrice;
}
public void setPromotionList(List<ProductPromotion> promotionList) {
this.promotionList = promotionList;
}
public String getPrice() {
......@@ -64,12 +89,12 @@ public class ProductResponse {
this.propFlag = propFlag;
}
public List<ProductProps> getPropList() {
return propList;
public Map<String, Set<ProductProp>> getProductPropSet() {
return productPropSet;
}
public void setPropList(List<ProductProps> propList) {
this.propList = propList;
public void setProductPropSet(Map<String, Set<ProductProp>> productPropSet) {
this.productPropSet = productPropSet;
}
public ItemInfo getItemInfo() {
......@@ -87,4 +112,12 @@ public class ProductResponse {
public void setPlatform(String platform) {
this.platform = platform;
}
public boolean isPromotionFlag() {
return promotionFlag;
}
public void setPromotionFlag(boolean promotionFlag) {
this.promotionFlag = promotionFlag;
}
}
......@@ -3,11 +3,11 @@ package com.diaoyun.zion.chinafrica.vo;
/**
* sku 库存
*/
public class ProductSku {
//sku拼接的字符串 ;1627207:425613015;
public class ProductSkuStock {
//sku id标识 ;1627207:425613015;
private String skuStr;
//可销售库存数量
private String sellableQuantity;
private int sellableQuantity;
public String getSkuStr() {
return skuStr;
......@@ -17,11 +17,11 @@ public class ProductSku {
this.skuStr = skuStr;
}
public String getSellableQuantity() {
public int getSellableQuantity() {
return sellableQuantity;
}
public void setSellableQuantity(String sellableQuantity) {
public void setSellableQuantity(int sellableQuantity) {
this.sellableQuantity = sellableQuantity;
}
}
......@@ -22,12 +22,12 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.io.UnsupportedEncodingException;
import java.net.*;
import java.nio.charset.Charset;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class HttpClientUtil {
private static Logger logger = LoggerFactory.getLogger(HttpClientUtil.class);
......@@ -40,6 +40,7 @@ public class HttpClientUtil {
* @throws IOException
*/
public static String getContentByUrl(String sourceUrl, String sourceType) throws URISyntaxException, IOException {
sourceUrl= urlEncode(sourceUrl,Consts.UTF_8.name());
URL url = new URL(sourceUrl);
//构建URI
URI uri=new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), null);
......@@ -235,4 +236,25 @@ public class HttpClientUtil {
sibClient.close();
return sibContent;
}
// Matches any character outside the range \x00-\xff (e.g. Chinese text and
// full-width punctuation). Compiled once instead of on every call.
private static final Pattern NON_LATIN1_PATTERN = Pattern.compile("[^\\x00-\\xff]");

/**
 * Percent-encodes every character of {@code url} that lies outside the range
 * {@code \x00-\xff}; all other characters are copied unchanged.
 *
 * <p>The link is rewritten in a single pass over the matches, so the cost is
 * linear in the length of the link.
 *
 * @param url      the link to encode
 * @param chartSet name of the charset used to encode the matched characters
 * @return the encoded link; if {@code chartSet} is not supported the error is
 *         logged and the original {@code url} is returned unchanged
 */
public static String urlEncode(String url,String chartSet)
{
    Matcher matcher = NON_LATIN1_PATTERN.matcher(url);
    StringBuffer encoded = new StringBuffer(url.length());
    try {
        while (matcher.find()) {
            String replacement = java.net.URLEncoder.encode(matcher.group(), chartSet);
            // quoteReplacement keeps the encoded text literal inside appendReplacement
            matcher.appendReplacement(encoded, Matcher.quoteReplacement(replacement));
        }
    } catch (UnsupportedEncodingException e) {
        logger.error("双字节编码异常:", e);
        return url;
    }
    matcher.appendTail(encoded);
    return encoded.toString();
}
}
......@@ -33,7 +33,7 @@ public class JsoupUtil {
String varArr[] = configGroup.split(";");
for (String variable : varArr) {
//获取g_config 变量
Pattern variablePattern = Pattern.compile("(var){1,1}\\s+(g_config){1,1}\\s+={1,1}[\\s\\S]*"); // Regex for the value of the key
Pattern variablePattern = Pattern.compile("(g_config){1,1}\\s+={1,1}[\\s\\S]*"); // Regex for the value of the key
Matcher varMatcher = variablePattern.matcher(variable);
while (varMatcher.find()) {
String configStr = varMatcher.group();
......@@ -86,7 +86,7 @@ public class JsoupUtil {
for (DataNode dataNode : element.dataNodes()) {
String dataStr = dataNode.getWholeData();
// Find the script tag whose content assigns the variable named by variableName
Pattern p = Pattern.compile("(var){1,1}\\s+(" + variableName + "){1,1}\\s+={1,1}[\\s\\S]*(;){1,1}"); // Regex for the value of the key
Pattern p = Pattern.compile("(" + variableName + "){1,1}\\s*={1,1}[\\s\\S]*(;){1,1}"); // Regex for the value of the key
Matcher m = p.matcher(dataStr); // you have to use html here and NOT text! Text will drop the 'key' part
while ((m.find())) {
//System.out.println(m.group());
......@@ -210,33 +210,20 @@ public class JsoupUtil {
}
/**
* 获取天猫商品详情 手机端的,手机端在香港会返回与大陆不一样的页面信息
* 获取变量的值
*
* @param content
* @return
*/
public static JSONObject getTmItemDetail(String content) {
String variableName = "_DATA_Detail";
public static JSONObject getItemDetailByName(String content, String variableName) {
    // Text that getScriptContent returns for the requested variable
    final String scriptText = getScriptContent(content, variableName);
    // Keep only the span from the first '{' through the last '}' and parse it as JSON
    final int jsonStart = scriptText.indexOf('{');
    final int jsonEnd = scriptText.lastIndexOf('}') + 1;
    return JSONObject.fromObject(scriptText.substring(jsonStart, jsonEnd));
}
/**
* 获取天猫商品详情
*
* @param content
* @return
*/
/* public static JSONObject getTmItemDetail(String content) {
//String variableName = "TShop.Setup";
String detailStr = getTmScriptContent(content);
JSONObject dataMap= JSONObject.fromObject(detailStr);
return dataMap;
}*/
}
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论