提交 fda39094 authored 作者: 梁业锦's avatar 梁业锦 💬

已完善Zara与优衣库的爬虫,增加了Massimodutti的爬虫

上级 e768cea5
......@@ -30,11 +30,11 @@
### Gap
- 主页:https://www.gap.cn/
- 命名:gap
- 爬虫进度:已完成
- 爬虫进度:**已完成**
### Zara
- 主页:https://www.zara.cn/cn
- 命名:zara
- 爬虫进度:已完成
- 爬虫进度:**已完成**
### Uniqlo
- 主页:https://www.uniqlo.cn/UNIQLO_U19FW_MEN.html
- 命名:uniqlo
......@@ -44,7 +44,7 @@
### Nike
- 主页:https://www.nike.com/cn
- 命名:nike
- 爬虫进度:已完成
- 爬虫进度:**已完成**
### Adidas
- 主页:https://www.adidas.com.cn/
- 命名:adidas
......@@ -52,7 +52,9 @@
### H&M
- 主页:https://www2.hm.com/zh_cn/
- 命名:hm
- 爬虫进度:
- 爬虫进度:已能获取到数据
- 图片路径处理难度太大
- 商品颜色通过商品详情页的url来区分,暂未找到规律
### LiLy
- 主页:http://www.lily.sh.cn/webapp/wcs/stores/servlet/lilystore
- 命名:lily
......@@ -65,16 +67,43 @@
- 主页:http://www.ur.cn/index.html
- 命名:ur
- 爬虫进度:
### Abercrombie & Fitch
- 主页:https://www.abercrombie.cn/zh_CN/home
- 命名:abercrombie
- 爬虫进度:
### Under Armour
- 主页:https://www.underarmour.cn/
- 命名:underarmour
- 爬虫进度:
### Converse
- 主页:https://www.converse.com.cn/
- 命名:converse
- 爬虫进度:
### Ochirly
- 主页:http://www.ochirly.com.cn/SALE/list.shtml
- 命名:ochirly
- 爬虫进度:
### Esprit
- 主页:https://www.esprit.cn/
- 命名:esprit
- 爬虫进度:
### Levi
- 主页:https://www.levi.com.cn/sale#page=3
- 命名:levi
- 爬虫进度:
### Moco
- 主页:https://www.moco.com/moco/zh/c/BS_DISCOUNT
- 命名:moco
- 爬虫进度:
### Massimo Dutti
- 主页:https://www.massimodutti.cn/cn/男装/季末折扣/休闲西装-c1745921.html
- 命名:massimodutti
- 爬虫进度:**已完成**
- 数据来源
- 商品详情:https://www.massimodutti.cn/cn/%E5%A5%B3%E8%A3%85/%E7%B3%BB%E5%88%97/%E8%A1%AC%E8%A1%AB%E5%92%8C%E7%BD%A9%E8%A1%AB/%E8%A1%AC%E8%A1%AB/%E6%BB%91%E9%9B%AA%E9%A3%8E%E7%B3%BB%E5%88%97%E9%A5%B0%E5%8F%A3%E8%A2%8B%E8%A1%AC%E8%A1%AB-c1718602p8730105.html?colorId=420&categoryId=1718602
- 数据接口:https://www.massimodutti.cn/itxrest/2/catalog/store/35009478/30359500/category/0/product/8730105/detail?languageId=-7&appId=1
### 待选爬虫网站:
- 10,http://www.ur.cn/index.html (UR, the most famous fashion brand in China)
- 11,https://www.abercrombie.cn/zh_CN/home (A&F, American brand)
- 12,https://www.underarmour.cn/ (famous American sports brand)
- 13,https://www.converse.com.cn/ (famous American canvas shoe brand)
- 14,http://www.ochirly.com.cn/SALE/list.shtml (one of the best fashion brands in China)
- 15,https://www.esprit.cn/ (American casual fashion brand)
- 16,https://www.levi.com.cn/sale#page=3 (Levi's)
- 17,https://www.moco.com/moco/zh/c/BS_DISCOUNT (one of the best fashion brands in China)
- 18,https://www.massimodutti.cn/cn/男装/季末折扣/休闲西装-c1745921.html ( 西班牙品牌)
- 19,https://china.coach.com/women.html
- 20,https://www.revolve.com/wrangler/br/57f1a1/?utm_source=baidu&utm_medium=cpc&utm_campaign=intl_P_cn-d-Wrangler (美国轻奢品牌集合网站)
......
package com.diaoyun.zion.chinafrica.bis.impl;
import com.alibaba.druid.support.json.JSONUtils;
import com.diaoyun.zion.chinafrica.bis.IItemSpider;
import com.diaoyun.zion.chinafrica.enums.PlatformEnum;
import com.diaoyun.zion.master.util.HttpClientUtil;
import com.diaoyun.zion.master.util.JsoupUtil;
import net.sf.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException;
/**
 * H&M product spider.
 *
 * <p>Downloads an H&M product-detail page, asks {@link JsoupUtil#getScriptContent} for the
 * script selected by the {@code productArticleDetails} variable name, and parses the object
 * literal found in that script.
 *
 * @author 爱酱油不爱醋
 */
@Component("hmSpider")
public class HmSpider implements IItemSpider {
    private static final Logger logger = LoggerFactory.getLogger(HmSpider.class);

    /** Variable name handed to {@link JsoupUtil#getScriptContent} to pick the product script. */
    private static final String DETAIL_VARIABLE = "productArticleDetails";

    /** Marker where the span that is cut out of the script begins. */
    private static final String IMAGES_KEY = "'images':[";

    /** Marker where the span that is cut out of the script ends (exclusive). */
    private static final String VIDEO_KEY = "'video':";

    /**
     * Fetches the product page and returns the product script's object literal as JSON.
     *
     * <p>The text from the first {@code 'images':[} marker up to (not including) the last
     * {@code 'video':} marker is removed before parsing. If either marker is missing the
     * script is parsed unchanged instead of failing on an invalid substring range.
     *
     * @param targetUrl H&M product-detail page URL
     * @return the parsed object literal
     * @throws IOException if the page cannot be fetched, or if it contains no usable
     *     {@code productArticleDetails} script / object literal
     */
    @Override
    public JSONObject captureItem(String targetUrl) throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
        String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.HM.getValue());
        String detailStr = JsoupUtil.getScriptContent(content, DETAIL_VARIABLE);
        // Defensive: whether getScriptContent reports "not found" as null or as an empty
        // string is not visible here, so both are handled.
        if (detailStr == null) {
            throw new IOException("No " + DETAIL_VARIABLE + " script found in H&M page: " + targetUrl);
        }

        int firstBracket = detailStr.indexOf('{');
        int lastBracket = detailStr.lastIndexOf('}');
        if (firstBracket < 0 || lastBracket < firstBracket) {
            throw new IOException("No object literal found in " + DETAIL_VARIABLE + " script of H&M page: " + targetUrl);
        }
        String resultStr = detailStr.substring(firstBracket, lastBracket + 1);

        int firstImage = detailStr.indexOf(IMAGES_KEY);
        int lastImage = detailStr.lastIndexOf(VIDEO_KEY);
        if (firstImage >= 0 && lastImage > firstImage) {
            // Drop everything between the two markers before handing the text to the parser.
            resultStr = resultStr.replace(detailStr.substring(firstImage, lastImage), "");
        } else {
            logger.warn("H&M image/video markers not found, parsing script unchanged: {}", targetUrl);
        }
        return JSONObject.fromObject(resultStr);
    }
}
package com.diaoyun.zion.chinafrica.bis.impl;
import com.diaoyun.zion.chinafrica.bis.IItemSpider;
import com.diaoyun.zion.chinafrica.enums.PlatformEnum;
import com.diaoyun.zion.chinafrica.vo.ProductResponse;
import com.diaoyun.zion.master.util.HttpClientUtil;
import com.diaoyun.zion.master.util.JsoupUtil;
import com.diaoyun.zion.master.util.SpiderUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import net.sf.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException;
/**
 * Massimo Dutti product spider.
 *
 * @author 爱酱油不爱醋
 */
@Component("massimoduttiSpider")
public class MassimoduttiSpider implements IItemSpider {
    private static final Logger logger = LoggerFactory.getLogger(MassimoduttiSpider.class);

    /**
     * Massimo Dutti product-detail page URL prefix.
     */
    private static final String MASSIMO_DUTTI_URL = "https://www.massimodutti.cn/cn/";

    /**
     * Prefix of the product-detail data endpoint; the product id is appended to it.
     */
    private static final String DETAIL_API_PREFIX = "https://www.massimodutti.cn/itxrest/2/catalog/store/35009478/30359500/category/0/product/";

    /**
     * Suffix of the product-detail data endpoint, appended after the product id.
     */
    private static final String DETAIL_API_SUFFIX = "/detail?languageId=-7&appId=1";

    /**
     * Fetches the product data for a Massimo Dutti product-detail page.
     *
     * <p>The product id is read from the page URL, the detail endpoint is queried with it, the
     * response is passed through {@link SpiderUtil#formatMassimoDuttiProductResponse}, converted
     * back to JSON and handed to {@link TranslateHelper#translateProductResponse}.
     *
     * @param targetUrl the product-detail page URL that was received
     * @return the formatted JSON after the translation step
     * @throws URISyntaxException if no numeric product id can be read from {@code targetUrl}
     * @see com.diaoyun.zion.chinafrica.service.impl.SpiderServiceImpl#judgeUrlType
     * @see SpiderUtil#formatMassimoDuttiProductResponse
     */
    @Override
    public JSONObject captureItem(String targetUrl) throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
        String pId = extractProductId(targetUrl);
        String dataUrl = DETAIL_API_PREFIX + pId + DETAIL_API_SUFFIX;
        String content = HttpClientUtil.getContentByUrl(dataUrl, PlatformEnum.MASSIMODUTTI.getValue());
        JSONObject resultObj = JSONObject.fromObject(content);
        ProductResponse productResponse = SpiderUtil.formatMassimoDuttiProductResponse(resultObj, pId);
        resultObj = JSONObject.fromObject(productResponse);
        TranslateHelper.translateProductResponse(resultObj);
        return resultObj;
    }

    /**
     * Extracts the product id: the digits between the last "p" and the first literal ".html".
     *
     * <p>A literal search is used instead of {@code split(".html")}, whose argument is a regular
     * expression in which the dot matches any character. The result is validated so that a URL
     * without a product id is rejected here rather than producing a malformed endpoint request.
     */
    private static String extractProductId(String targetUrl) throws URISyntaxException {
        int htmlIdx = targetUrl.indexOf(".html");
        String path = htmlIdx >= 0 ? targetUrl.substring(0, htmlIdx) : targetUrl;
        String pId = path.substring(path.lastIndexOf('p') + 1);
        if (!isAllDigits(pId)) {
            throw new URISyntaxException(targetUrl, "No numeric product id found before \".html\"");
        }
        return pId;
    }

    /** Returns true when {@code s} is non-empty and consists only of ASCII digits. */
    private static boolean isAllDigits(String s) {
        if (s.isEmpty()) {
            return false;
        }
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }
}
......@@ -23,44 +23,24 @@ public class PullandbearSpider implements IItemSpider {
private static Logger logger = LoggerFactory.getLogger(PullandbearSpider.class);
// Pull and Bear product-detail endpoint prefix
private static final String pullandbearUrl="https://www.pullandbear.cn/itxrest/2/catalog/store/24009528/20309423/category/0/product/";
/**
* Pull and Bear product-detail link; the product id and the "/detail?..." suffix are appended to it.
*/
private static final String PULL_AND_BEAR_URL="https://www.pullandbear.cn/itxrest/2/catalog/store/24009528/20309423/category/0/product/";
/**
* Fetches the Pull and Bear product-detail JSON for the product referenced by {@code targetUrl}.
* The product id is taken as the text between the last "p" and the last ".html" in the URL.
*
* NOTE(review): if a "p" occurs after the last ".html" (for example in a query string), or the
* URL contains no ".html", the substring bounds are invalid and substring throws — confirm the
* URL shapes callers pass in.
*
* @param targetUrl product-detail page URL
* @return the product data as a JSONObject
*/
@Override
public JSONObject captureItem(String targetUrl) throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
// Extract the product id from the link
String pId = targetUrl.substring(targetUrl.lastIndexOf("p")+1, targetUrl.lastIndexOf(".html"));
// Build the endpoint link used to fetch the product-detail JSON
targetUrl = pullandbearUrl + pId + "/detail?languageId=-7&appId=1";
targetUrl = PULL_AND_BEAR_URL + pId + "/detail?languageId=-7&appId=1";
// Fetch the response content
String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.PULLANDBEAR.getValue());
// Convert to JSON
JSONObject json = JSONObject.fromObject(content);
// Simplify the JSON
json = getMainData(json, content);
return json;
JSONObject resultJson = JSONObject.fromObject(content);
return resultJson;
}
/**
 * Reduces the detail response to a small object with three keys.
 *
 * <p>Only the first entry of {@code bundleProductSummaries} is read. The result holds its
 * {@code id}, its {@code name}, and under {@code data} the {@code detail.colors} array.
 *
 * @param json    the parsed detail response
 * @param content the raw response text (not used by this method)
 * @return a new object with the keys {@code id}, {@code name} and {@code data}
 */
private JSONObject getMainData(JSONObject json, String content) {
    JSONObject summary = json.getJSONArray("bundleProductSummaries").getJSONObject(0);
    JSONObject simplified = new JSONObject();
    // Product id
    simplified.put("id", summary.getString("id"));
    // Product name
    simplified.put("name", summary.getString("name"));
    // Colour entries of the product detail
    simplified.put("data", summary.getJSONObject("detail").getJSONArray("colors"));
    return simplified;
}
}
......
......@@ -20,34 +20,51 @@ import java.util.concurrent.TimeoutException;
/**
* Uniqlo data spider
*
* Image path is: "https://www.uniqlo.cn/hmall/test/" + product id + "/sku/561/" + product image id + ".jpg"
*
* @author 爱酱油不爱醋
*/
@Component("uniqloSpider")
public class UniqloSpider implements IItemSpider {
private static Logger logger = LoggerFactory.getLogger(UniqloSpider.class);
// Uniqlo data spider
private static final String uniqloUrl = "https://www.uniqlo.cn/data/products/spu/";
/**
* Uniqlo product-detail link
*/
private static final String UNIQLO_URL = "https://www.uniqlo.cn/data/products/spu/";
/**
* Uniqlo data spider
*
* NOTE(review): the product id is everything after "productCode=" in the URL, so any further
* query parameters following it would become part of the id — confirm the URLs callers pass in.
*
* @see com.diaoyun.zion.chinafrica.service.impl.SpiderServiceImpl#judgeUrlType adjusts the product-detail page path
* @see SpiderUtil#formatUniqloProductResponse data formatting method
* @param targetUrl the product-detail URL that was received
* @return the formatted and translated JSON data
*/
@Override
public JSONObject captureItem(String targetUrl) throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
// Get the product id
// Extract the product id from the link
String[] split = targetUrl.split("productCode=");
String pId = split[1];
// Build the link of the product-detail JSON
targetUrl = uniqloUrl + "zh_CN/" + pId + ".json";
// Fetch the page content
targetUrl = UNIQLO_URL + "zh_CN/" + pId + ".json";
String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.UNIQLO.getValue());
// Convert to JSON
// Endpoint that provides the discounted price
String priceUrl = "https://d.uniqlo.cn/p/product/i/product/spu/pc/query/" + pId + "/zh_CN";
String priceContent = HttpClientUtil.getContentByUrl(priceUrl, PlatformEnum.UNIQLO.getValue());
JSONObject resultJson = JSONObject.fromObject(content);
// Wrap the product data according to the response convention
ProductResponse productResponse = SpiderUtil.formatUniqloProductResponse(resultJson, pId);
JSONObject priceJson = JSONObject.fromObject(priceContent);
ProductResponse productResponse = SpiderUtil.formatUniqloProductResponse(resultJson, priceJson, pId);
resultJson = JSONObject.fromObject(productResponse);
// Translate
TranslateHelper.translateProductResponse(resultJson);
return resultJson;
}
/**
* Manual check: fetches the detail JSON for one hard-coded product URL and prints the raw response.
*/
public static void main(String[] args) throws Exception {
String targetUrl = "https://www.uniqlo.cn/product-detail.html?productCode=u0000000001970";
// Extract the product id from the link
String[] split = targetUrl.split("productCode=");
String pId = split[1];
targetUrl = UNIQLO_URL + "zh_CN/" + pId + ".json";
String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.UNIQLO.getValue());
System.out.println(content);
}
}
......@@ -29,21 +29,26 @@ import java.util.concurrent.TimeoutException;
public class ZaraSpider implements IItemSpider {
private static Logger logger = LoggerFactory.getLogger(ZaraSpider.class);
// Spanish fashion brand data spider
private static final String zaraUrl = "https://www.zara.cn/cn/zh/";
/**
* Zara product-detail page URL
*/
private static final String ZARA_URL = "https://www.zara.cn/cn/zh/";
/**
 * Zara data spider.
 *
 * <p>Downloads the page at {@code targetUrl}, extracts the product JSON with
 * {@link JsoupUtil#getZaraJsonData}, formats it with {@link SpiderUtil#formatZaraProductResponse},
 * converts the formatted response back to JSON and passes it to
 * {@link TranslateHelper#translateProductResponse} before returning it.
 *
 * @see com.diaoyun.zion.chinafrica.service.impl.SpiderServiceImpl#judgeUrlType
 * @param targetUrl the product-detail URL that was received
 * @return the formatted JSON after the translation step
 */
@Override
public JSONObject captureItem(String targetUrl) throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
    // Raw page content for the given URL
    String pageContent = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.ZARA.getValue());
    // Main product JSON cut out of the page
    JSONObject extracted = JsoupUtil.getZaraJsonData(pageContent);
    // Product data wrapped in the common response type
    ProductResponse formatted = SpiderUtil.formatZaraProductResponse(extracted);
    JSONObject translated = JSONObject.fromObject(formatted);
    TranslateHelper.translateProductResponse(translated);
    return translated;
}
......
......@@ -17,13 +17,14 @@ public enum PlatformEnum implements EnumItemable<PlatformEnum> {
ZARA("Zara", "zara"),
UNIQLO("优衣库", "uniqlo"),
NIKE("NIKE", "nike"),
HM("H&M", "hm"),
MASSIMODUTTI("MassimoDutti", "massimodutti"),
UN("未知", "un"),
AfriEshop("afri-eshop","afri-eshop" );
private String label;
private String value;
PlatformEnum(String label, String value) {
this.label = label;
this.value = value;
......
......@@ -40,6 +40,14 @@ public class ItemSpiderFactory {
iItemSpider= (IItemSpider) SpringContextUtil.getBean("nikeItemSpider");
break;
}
case "hm":{
iItemSpider= (IItemSpider) SpringContextUtil.getBean("hmSpider");
break;
}
case "massimodutti":{
iItemSpider= (IItemSpider) SpringContextUtil.getBean("massimoduttiSpider");
break;
}
case "afri-eshop":{
iItemSpider= (IItemSpider) SpringContextUtil.getBean("africaShopItemSpider");
break;
......
......@@ -43,10 +43,12 @@ public class SpiderServiceImpl implements SpiderService {
private PlatformEnum judgeUrlType(String targetUrl) {
PlatformEnum platformEnum = PlatformEnum.UN;
if(targetUrl.contains("taobao.com")&&(targetUrl.contains("item.htm")||targetUrl.contains("detail.htm"))) {
platformEnum=PlatformEnum.TB;
} else if(targetUrl.contains("tmall.com/item.htm")) {
platformEnum=PlatformEnum.TM;
if (targetUrl.contains("taobao.com") && (targetUrl.contains("item.htm") || targetUrl.contains("detail.htm"))) {
platformEnum = PlatformEnum.TB;
} else if (targetUrl.contains("tmall.com/item.htm")) {
platformEnum = PlatformEnum.TM;
} else if (targetUrl.contains("pullandbear.cn/cn/")) {
platformEnum = PlatformEnum.PULLANDBEAR;
} else if(targetUrl.contains("www.gap.cn/pdp/")) {
platformEnum=PlatformEnum.GAP;
} else if(targetUrl.contains("www.nike.com/cn/t/")) {
......@@ -54,9 +56,13 @@ public class SpiderServiceImpl implements SpiderService {
} else if(targetUrl.contains("www.afri-eshop.com")&&targetUrl.contains("/products/")) {
platformEnum=PlatformEnum.AfriEshop;
} else if (targetUrl.contains("zara.cn")) {
platformEnum = platformEnum.ZARA;
platformEnum = PlatformEnum.ZARA;
} else if (targetUrl.contains("uniqlo.cn/product-detail.html")) {
platformEnum = platformEnum.UNIQLO;
platformEnum = PlatformEnum.UNIQLO;
} else if (targetUrl.contains("hm.com/zh_cn/productpage")) {
platformEnum = PlatformEnum.HM;
} else if (targetUrl.contains("massimodutti.cn/")) {
platformEnum = PlatformEnum.MASSIMODUTTI;
}
return platformEnum;
}
......
......@@ -2,17 +2,29 @@ package com.diaoyun.zion.chinafrica.vo;
/**
* 商品信息
*
* @author G
*/
public class ItemInfo {
// Product id
/**
* Product id
*/
private String itemId;
// Product title
/**
* Product title
*/
private String title;
// Main product image
/**
* Main product image
*/
private String pic;
// Shop the product belongs to
/**
* Shop the product belongs to
*/
private String shopName;
// Shop link
/**
* Shop link
*/
private String shopUrl;
public String getItemId() {
......
......@@ -56,15 +56,6 @@ public class ProductResponse {
*/
private String platform;
/** Returns the value of the {@code stockFlag} field. */
public boolean isStockFlag() {
return stockFlag;
}
/** Sets the {@code stockFlag} field. */
public void setStockFlag(boolean stockFlag) {
this.stockFlag = stockFlag;
}
/** Returns the {@code originalPriceList} field (the stored list itself, not a copy). */
public List<OriginalPrice> getOriginalPriceList() {
return originalPriceList;
}
......@@ -73,16 +64,16 @@ public class ProductResponse {
this.originalPriceList = originalPriceList;
}
public List<ProductPromotion> getPromotionList() {
return promotionList;
public boolean isPromotionFlag() {
return promotionFlag;
}
public String getSalePrice() {
return salePrice;
public void setPromotionFlag(boolean promotionFlag) {
this.promotionFlag = promotionFlag;
}
public void setSalePrice(String salePrice) {
this.salePrice = salePrice;
public List<ProductPromotion> getPromotionList() {
return promotionList;
}
public void setPromotionList(List<ProductPromotion> promotionList) {
......@@ -97,6 +88,22 @@ public class ProductResponse {
this.price = price;
}
/** Returns the value of the {@code salePrice} field. */
public String getSalePrice() {
return salePrice;
}
/** Sets the {@code salePrice} field. */
public void setSalePrice(String salePrice) {
this.salePrice = salePrice;
}
/** Returns the value of the {@code stockFlag} field. */
public boolean isStockFlag() {
return stockFlag;
}
/** Sets the {@code stockFlag} field. */
public void setStockFlag(boolean stockFlag) {
this.stockFlag = stockFlag;
}
/** Returns the value of the {@code dynStock} field. */
public DynStock getDynStock() {
return dynStock;
}
......@@ -136,12 +143,4 @@ public class ProductResponse {
/** Sets the {@code platform} field. */
public void setPlatform(String platform) {
this.platform = platform;
}
/** Returns the value of the {@code promotionFlag} field. */
public boolean isPromotionFlag() {
return promotionFlag;
}
/** Sets the {@code promotionFlag} field. */
public void setPromotionFlag(boolean promotionFlag) {
this.promotionFlag = promotionFlag;
}
}
......@@ -84,7 +84,7 @@ public class JsoupUtil {
* @param variableName
* @return
*/
private static String getScriptContent(String content, String variableName) {
public static String getScriptContent(String content, String variableName) {
//logger.info(content);
Document document = Jsoup.parse(content);
Elements elementList = document.getElementsByTag("script");
......@@ -150,6 +150,22 @@ public class JsoupUtil {
}
}
/**
 * Manual check against one hard-coded H&M product page.
 *
 * <p>Fetches the page, takes the script selected by {@code productArticleDetails}, cuts the span
 * from the first {@code 'images':[} marker up to the last {@code 'video':} marker out of the
 * script's outermost object literal, parses the remainder as JSON and prints it.
 */
public static void main(String[] args) throws Exception {
    String pageUrl = "https://www2.hm.com/zh_cn/productpage.0809313001.html";
    String html = HttpClientUtil.getContentByUrl(pageUrl, PlatformEnum.HM.getValue());
    String script = getScriptContent(html, "productArticleDetails");
    // Outermost object literal: first "{" through last "}"
    String objectLiteral = script.substring(script.indexOf("{"), script.lastIndexOf("}") + 1);
    // Span that is removed before parsing
    String imageSpan = script.substring(script.indexOf("'images':["), script.lastIndexOf("'video':"));
    JSONObject parsed = JSONObject.fromObject(objectLiteral.replace(imageSpan, ""));
    System.out.println(parsed);
}
/**
* 解析出商品详情
*
......@@ -232,7 +248,7 @@ public class JsoupUtil {
}
/**
* 获取Zara爬虫的主要数据
* 获取 Zara 爬虫的主要数据
* @param content
* @return
*/
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论