提交 a508f493 作者: 梁业锦

增加正则表达式判断商品详情页的链接

上级 ee1ac0ad
差异被折叠。
...@@ -17,6 +17,7 @@ import java.net.URISyntaxException; ...@@ -17,6 +17,7 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate; import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate;
...@@ -38,10 +39,12 @@ public class AdidasSpider implements IItemSpider { ...@@ -38,10 +39,12 @@ public class AdidasSpider implements IItemSpider {
@Override @Override
public JSONObject captureItem(String targetUrl) throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException { public JSONObject captureItem(String targetUrl) throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
// 截取商品的 id // 截取商品的 id
int labelHeadIndex = targetUrl.indexOf("item/"); final String regex = "\\w+\\d+";
int labelTailIndex = targetUrl.lastIndexOf("?"); Pattern pattern = Pattern.compile(regex);
String pId = targetUrl.substring(labelHeadIndex, labelTailIndex).replace("item/", ""); Matcher matcher = pattern.matcher(targetUrl);
// 对应的商品数据接口 matcher.find();
String pId = matcher.group(0);
// 对应的商品数据接口
targetUrl = "https://www.adidas.com.cn/item/othercolor?itemCode=" + pId; targetUrl = "https://www.adidas.com.cn/item/othercolor?itemCode=" + pId;
String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.ADIDAS.getValue()); String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.ADIDAS.getValue());
ProductResponse productResponse = formatProductResponse(content, pId); ProductResponse productResponse = formatProductResponse(content, pId);
...@@ -50,17 +53,6 @@ public class AdidasSpider implements IItemSpider { ...@@ -50,17 +53,6 @@ public class AdidasSpider implements IItemSpider {
return resultJson; return resultJson;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
* @param content 主要的页面数据 * @param content 主要的页面数据
......
...@@ -4,8 +4,8 @@ import com.diaoyun.zion.chinafrica.bis.IItemSpider; ...@@ -4,8 +4,8 @@ import com.diaoyun.zion.chinafrica.bis.IItemSpider;
import com.diaoyun.zion.chinafrica.enums.PlatformEnum; import com.diaoyun.zion.chinafrica.enums.PlatformEnum;
import com.diaoyun.zion.chinafrica.vo.*; import com.diaoyun.zion.chinafrica.vo.*;
import com.diaoyun.zion.master.util.HttpClientUtil; import com.diaoyun.zion.master.util.HttpClientUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import com.diaoyun.zion.master.util.SpiderUtil; import com.diaoyun.zion.master.util.SpiderUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import net.sf.json.JSONArray; import net.sf.json.JSONArray;
import net.sf.json.JSONObject; import net.sf.json.JSONObject;
import org.slf4j.Logger; import org.slf4j.Logger;
...@@ -17,7 +17,6 @@ import java.net.URISyntaxException; ...@@ -17,7 +17,6 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate; import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate;
...@@ -44,26 +43,11 @@ public class CoachSpider implements IItemSpider { ...@@ -44,26 +43,11 @@ public class CoachSpider implements IItemSpider {
String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.COACH.getValue()); String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.COACH.getValue());
JSONObject resultObj = JSONObject.fromObject(content); JSONObject resultObj = JSONObject.fromObject(content);
ProductResponse productResponse = formatProductResponse(resultObj, pId); ProductResponse productResponse = formatProductResponse(resultObj, pId);
if (productResponse.getItemInfo() == null) {
resultObj.put("message", "找不到此类网址的数据爬虫!");
return resultObj;
}
resultObj = JSONObject.fromObject(productResponse); resultObj = JSONObject.fromObject(productResponse);
TranslateHelper.translateProductResponse(resultObj); TranslateHelper.translateProductResponse(resultObj);
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
* @param dataMap 主要的Json数据 * @param dataMap 主要的Json数据
......
...@@ -17,7 +17,6 @@ import java.net.URISyntaxException; ...@@ -17,7 +17,6 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate; import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate;
...@@ -46,17 +45,6 @@ public class EspritSpider implements IItemSpider { ...@@ -46,17 +45,6 @@ public class EspritSpider implements IItemSpider {
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
* *
......
...@@ -49,17 +49,6 @@ public class GucciSpider implements IItemSpider { ...@@ -49,17 +49,6 @@ public class GucciSpider implements IItemSpider {
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
* @param content 主要的网页内容 * @param content 主要的网页内容
......
...@@ -5,8 +5,8 @@ import com.diaoyun.zion.chinafrica.enums.PlatformEnum; ...@@ -5,8 +5,8 @@ import com.diaoyun.zion.chinafrica.enums.PlatformEnum;
import com.diaoyun.zion.chinafrica.vo.*; import com.diaoyun.zion.chinafrica.vo.*;
import com.diaoyun.zion.master.util.HttpClientUtil; import com.diaoyun.zion.master.util.HttpClientUtil;
import com.diaoyun.zion.master.util.JsoupUtil; import com.diaoyun.zion.master.util.JsoupUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import com.diaoyun.zion.master.util.SpiderUtil; import com.diaoyun.zion.master.util.SpiderUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import net.sf.json.JSONArray; import net.sf.json.JSONArray;
import net.sf.json.JSONObject; import net.sf.json.JSONObject;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
...@@ -23,7 +23,6 @@ import java.net.URISyntaxException; ...@@ -23,7 +23,6 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate; import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate;
...@@ -52,17 +51,6 @@ public class HmSpider implements IItemSpider { ...@@ -52,17 +51,6 @@ public class HmSpider implements IItemSpider {
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
* *
...@@ -78,7 +66,9 @@ public class HmSpider implements IItemSpider { ...@@ -78,7 +66,9 @@ public class HmSpider implements IItemSpider {
resultStr = resultStr.replaceAll("\'", "\"") resultStr = resultStr.replaceAll("\'", "\"")
.replaceAll("\"image\": isDesktop [?] ", "") .replaceAll("\"image\": isDesktop [?] ", "")
.replaceAll("\"fullscreen\": isDesktop [?] ", "") .replaceAll("\"fullscreen\": isDesktop [?] ", "")
.replaceAll("\"zoom\": isDesktop [?] ", ""); .replaceAll("\"zoom\": isDesktop [?] ", "")
.replaceAll("isDesktop [?] \"//www2.hm.com/\" : ","");
System.err.println(resultStr);
JSONObject dataMap = JSONObject.fromObject(resultStr); JSONObject dataMap = JSONObject.fromObject(resultStr);
Document document = Jsoup.parse(content); Document document = Jsoup.parse(content);
......
...@@ -17,7 +17,6 @@ import java.net.URISyntaxException; ...@@ -17,7 +17,6 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate; import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate;
...@@ -50,17 +49,6 @@ public class LeviSpider implements IItemSpider { ...@@ -50,17 +49,6 @@ public class LeviSpider implements IItemSpider {
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
* @param dataMap 主要的Json数据 * @param dataMap 主要的Json数据
......
...@@ -45,17 +45,6 @@ public class MajeSpider implements IItemSpider { ...@@ -45,17 +45,6 @@ public class MajeSpider implements IItemSpider {
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
* @param content 主要的页面数据 * @param content 主要的页面数据
......
...@@ -4,8 +4,8 @@ import com.diaoyun.zion.chinafrica.bis.IItemSpider; ...@@ -4,8 +4,8 @@ import com.diaoyun.zion.chinafrica.bis.IItemSpider;
import com.diaoyun.zion.chinafrica.enums.PlatformEnum; import com.diaoyun.zion.chinafrica.enums.PlatformEnum;
import com.diaoyun.zion.chinafrica.vo.*; import com.diaoyun.zion.chinafrica.vo.*;
import com.diaoyun.zion.master.util.HttpClientUtil; import com.diaoyun.zion.master.util.HttpClientUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import com.diaoyun.zion.master.util.SpiderUtil; import com.diaoyun.zion.master.util.SpiderUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import net.sf.json.JSONArray; import net.sf.json.JSONArray;
import net.sf.json.JSONObject; import net.sf.json.JSONObject;
import org.slf4j.Logger; import org.slf4j.Logger;
...@@ -18,7 +18,6 @@ import java.net.URISyntaxException; ...@@ -18,7 +18,6 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
/** /**
* Massimo Dutti 数据爬虫 * Massimo Dutti 数据爬虫
...@@ -50,17 +49,6 @@ public class MassimoduttiSpider implements IItemSpider { ...@@ -50,17 +49,6 @@ public class MassimoduttiSpider implements IItemSpider {
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
* @param dataMap 主要的 json 数据 * @param dataMap 主要的 json 数据
......
...@@ -52,17 +52,6 @@ public class MocoSpider implements IItemSpider { ...@@ -52,17 +52,6 @@ public class MocoSpider implements IItemSpider {
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
* *
......
...@@ -20,7 +20,6 @@ import java.net.URISyntaxException; ...@@ -20,7 +20,6 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
/** /**
* OchirlySpider 数据爬虫 * OchirlySpider 数据爬虫
...@@ -46,16 +45,6 @@ public class OchirlySpider implements IItemSpider { ...@@ -46,16 +45,6 @@ public class OchirlySpider implements IItemSpider {
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
......
...@@ -4,8 +4,8 @@ import com.diaoyun.zion.chinafrica.bis.IItemSpider; ...@@ -4,8 +4,8 @@ import com.diaoyun.zion.chinafrica.bis.IItemSpider;
import com.diaoyun.zion.chinafrica.enums.PlatformEnum; import com.diaoyun.zion.chinafrica.enums.PlatformEnum;
import com.diaoyun.zion.chinafrica.vo.*; import com.diaoyun.zion.chinafrica.vo.*;
import com.diaoyun.zion.master.util.HttpClientUtil; import com.diaoyun.zion.master.util.HttpClientUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import com.diaoyun.zion.master.util.SpiderUtil; import com.diaoyun.zion.master.util.SpiderUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import net.sf.json.JSONArray; import net.sf.json.JSONArray;
import net.sf.json.JSONObject; import net.sf.json.JSONObject;
import org.slf4j.Logger; import org.slf4j.Logger;
...@@ -18,7 +18,6 @@ import java.net.URISyntaxException; ...@@ -18,7 +18,6 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
/** /**
* Oysho 数据爬虫 * Oysho 数据爬虫
...@@ -36,15 +35,8 @@ public class OyshoSpider implements IItemSpider { ...@@ -36,15 +35,8 @@ public class OyshoSpider implements IItemSpider {
*/ */
@Override @Override
public JSONObject captureItem(String targetUrl) throws InterruptedException, IOException, ExecutionException, URISyntaxException, TimeoutException { public JSONObject captureItem(String targetUrl) throws InterruptedException, IOException, ExecutionException, URISyntaxException, TimeoutException {
String pId = ""; String[] spilt = targetUrl.split("origenId=");
if (targetUrl.contains("origenId")) { String pId = spilt[1];
String[] spilt = targetUrl.split("origenId=");
pId = spilt[1];
} else {
String[] spilt = targetUrl.split("p");
spilt = spilt[2].split(".html");
pId = spilt[0].replaceAll(".html", "");
}
targetUrl = "https://www.oysho.cn/itxrest/2/catalog/store/65009628/60361118/category/0/product/" + pId + "/detail"; targetUrl = "https://www.oysho.cn/itxrest/2/catalog/store/65009628/60361118/category/0/product/" + pId + "/detail";
String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.OYSHO.getValue()); String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.OYSHO.getValue());
ProductResponse productResponse = formatProductResponse(content, pId); ProductResponse productResponse = formatProductResponse(content, pId);
...@@ -53,17 +45,6 @@ public class OyshoSpider implements IItemSpider { ...@@ -53,17 +45,6 @@ public class OyshoSpider implements IItemSpider {
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
* *
......
...@@ -4,8 +4,8 @@ import com.diaoyun.zion.chinafrica.bis.IItemSpider; ...@@ -4,8 +4,8 @@ import com.diaoyun.zion.chinafrica.bis.IItemSpider;
import com.diaoyun.zion.chinafrica.enums.PlatformEnum; import com.diaoyun.zion.chinafrica.enums.PlatformEnum;
import com.diaoyun.zion.chinafrica.vo.*; import com.diaoyun.zion.chinafrica.vo.*;
import com.diaoyun.zion.master.util.HttpClientUtil; import com.diaoyun.zion.master.util.HttpClientUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import com.diaoyun.zion.master.util.SpiderUtil; import com.diaoyun.zion.master.util.SpiderUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import net.sf.json.JSONArray; import net.sf.json.JSONArray;
import net.sf.json.JSONObject; import net.sf.json.JSONObject;
import org.slf4j.Logger; import org.slf4j.Logger;
...@@ -18,7 +18,6 @@ import java.net.URISyntaxException; ...@@ -18,7 +18,6 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
/** /**
* 西班牙年轻时尚品牌-PullAndBear 数据爬虫 * 西班牙年轻时尚品牌-PullAndBear 数据爬虫
...@@ -36,31 +35,16 @@ public class PullandbearSpider implements IItemSpider { ...@@ -36,31 +35,16 @@ public class PullandbearSpider implements IItemSpider {
*/ */
@Override @Override
public JSONObject captureItem(String targetUrl) throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException { public JSONObject captureItem(String targetUrl) throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
JSONObject resultJson = new JSONObject();
if (!urlPattern(targetUrl)) {
resultJson.put("message", "不是商品的详情页路径");
return resultJson;
}
String pId = targetUrl.substring(targetUrl.lastIndexOf("p")+1, targetUrl.lastIndexOf(".html")); String pId = targetUrl.substring(targetUrl.lastIndexOf("p")+1, targetUrl.lastIndexOf(".html"));
targetUrl = "https://www.pullandbear.cn/itxrest/2/catalog/store/24009528/20309423/category/0/product/" + pId + "/detail?languageId=-7&appId=1"; targetUrl = "https://www.pullandbear.cn/itxrest/2/catalog/store/24009528/20309423/category/0/product/" + pId + "/detail?languageId=-7&appId=1";
String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.PULLANDBEAR.getValue()); String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.PULLANDBEAR.getValue());
resultJson = JSONObject.fromObject(content); JSONObject resultJson = JSONObject.fromObject(content);
ProductResponse productResponse = formatProductResponse(resultJson, pId); ProductResponse productResponse = formatProductResponse(resultJson, pId);
resultJson = JSONObject.fromObject(productResponse); resultJson = JSONObject.fromObject(productResponse);
TranslateHelper.translateProductResponse(resultJson); TranslateHelper.translateProductResponse(resultJson);
return resultJson; return resultJson;
} }
/**
* 正则匹配是否为商品详情页的链接
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "^.*pullandbear\\.cn/.*-c\\d{5,10}p\\d{9,10}.html\\?cS=\\d*";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化 PullAndBear 返回数据 * 格式化 PullAndBear 返回数据
* @see com.diaoyun.zion.chinafrica.bis.impl.PullandbearSpider * @see com.diaoyun.zion.chinafrica.bis.impl.PullandbearSpider
...@@ -84,7 +68,7 @@ public class PullandbearSpider implements IItemSpider { ...@@ -84,7 +68,7 @@ public class PullandbearSpider implements IItemSpider {
Map<String, Set<ProductProp>> productPropSet = new HashMap<>(16); Map<String, Set<ProductProp>> productPropSet = new HashMap<>(16);
Set<ProductProp> propSetColor = new HashSet<>(16); Set<ProductProp> propSetColor = new HashSet<>(16);
Set<ProductProp> sizePropSetSize = new HashSet<>(16); Set<ProductProp> sizePropSetSize = new HashSet<>(16);
productResponse.setStockFlag(true); productResponse.setStockFlag(false);
// 商品的基本属性 // 商品的基本属性
ItemInfo itemInfo = new ItemInfo(); ItemInfo itemInfo = new ItemInfo();
......
...@@ -4,8 +4,8 @@ import com.diaoyun.zion.chinafrica.bis.IItemSpider; ...@@ -4,8 +4,8 @@ import com.diaoyun.zion.chinafrica.bis.IItemSpider;
import com.diaoyun.zion.chinafrica.enums.PlatformEnum; import com.diaoyun.zion.chinafrica.enums.PlatformEnum;
import com.diaoyun.zion.chinafrica.vo.*; import com.diaoyun.zion.chinafrica.vo.*;
import com.diaoyun.zion.master.util.HttpClientUtil; import com.diaoyun.zion.master.util.HttpClientUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import com.diaoyun.zion.master.util.SpiderUtil; import com.diaoyun.zion.master.util.SpiderUtil;
import com.diaoyun.zion.master.util.TranslateHelper;
import net.sf.json.JSONObject; import net.sf.json.JSONObject;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
...@@ -20,11 +20,10 @@ import java.net.URISyntaxException; ...@@ -20,11 +20,10 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
/** /**
* Revolve 数据爬虫 * Revolve 数据爬虫
* * TODO 许多商品获取不到数据,待优化
* @author 爱酱油不爱醋 * @author 爱酱油不爱醋
*/ */
@Component("revolveSpider") @Component("revolveSpider")
...@@ -45,17 +44,6 @@ public class RevolveSpider implements IItemSpider { ...@@ -45,17 +44,6 @@ public class RevolveSpider implements IItemSpider {
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
* @param content 主要的网页内容 * @param content 主要的网页内容
......
...@@ -57,17 +57,6 @@ public class UniqloSpider implements IItemSpider { ...@@ -57,17 +57,6 @@ public class UniqloSpider implements IItemSpider {
return resultJson; return resultJson;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 返回格式化数据 * 返回格式化数据
* @param dataMap 调用优衣库网页接口接收的主要商品数据 * @param dataMap 调用优衣库网页接口接收的主要商品数据
......
...@@ -16,7 +16,6 @@ import java.net.URISyntaxException; ...@@ -16,7 +16,6 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate; import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate;
...@@ -52,17 +51,6 @@ public class UrbanRevivoSpider implements IItemSpider { ...@@ -52,17 +51,6 @@ public class UrbanRevivoSpider implements IItemSpider {
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化返回数据 * 格式化返回数据
* @param dataMap 主要的 json 数据 * @param dataMap 主要的 json 数据
......
...@@ -17,7 +17,6 @@ import java.net.URISyntaxException; ...@@ -17,7 +17,6 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate; import static com.diaoyun.zion.master.util.SpiderUtil.exchangeRate;
...@@ -48,17 +47,6 @@ public class ZaraSpider implements IItemSpider { ...@@ -48,17 +47,6 @@ public class ZaraSpider implements IItemSpider {
return resultObj; return resultObj;
} }
/**
* 正则匹配是否为商品详情页的链接
* TODO 正则编写
* @param targetUrl url路径
* @return 匹配失败则返回错误信息
*/
private boolean urlPattern(String targetUrl) {
String regex = "";
return Pattern.matches(regex, targetUrl);
}
/** /**
* 格式化 Zara 返回数据 * 格式化 Zara 返回数据
* @param dataMap 主要的 json 数据 * @param dataMap 主要的 json 数据
......
...@@ -14,6 +14,7 @@ import java.math.BigDecimal; ...@@ -14,6 +14,7 @@ import java.math.BigDecimal;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
/** /**
* 爬虫服务类 * 爬虫服务类
...@@ -51,7 +52,8 @@ public class SpiderServiceImpl implements SpiderService { ...@@ -51,7 +52,8 @@ public class SpiderServiceImpl implements SpiderService {
/** /**
* 判断链接属于哪个平台 * 判断链接属于哪个平台
* @param targetUrl 在商品详情页截取到的路径 * 先通过平台名判断所属平台,之后使用正则表达式判断是否为平台的商品详情页
* @param targetUrl 在app访问平台时获取到的链接
* @return 对应的爬虫 * @return 对应的爬虫
*/ */
private PlatformEnum judgeUrlType(String targetUrl) { private PlatformEnum judgeUrlType(String targetUrl) {
...@@ -60,7 +62,7 @@ public class SpiderServiceImpl implements SpiderService { ...@@ -60,7 +62,7 @@ public class SpiderServiceImpl implements SpiderService {
platformEnum = PlatformEnum.TB; platformEnum = PlatformEnum.TB;
} else if (targetUrl.contains("tmall.com/item.htm")) { } else if (targetUrl.contains("tmall.com/item.htm")) {
platformEnum = PlatformEnum.TM; platformEnum = PlatformEnum.TM;
} else if (targetUrl.contains("pullandbear")) { } else if (targetUrl.contains("pullandbear") && Pattern.matches("^.*pullandbear\\.cn/.*-c\\d{5,10}p\\d{9,10}.html\\?cS=\\d*$", targetUrl)) {
platformEnum = PlatformEnum.PULLANDBEAR; platformEnum = PlatformEnum.PULLANDBEAR;
} else if(targetUrl.contains("www.gap.cn/pdp/")) { } else if(targetUrl.contains("www.gap.cn/pdp/")) {
platformEnum=PlatformEnum.GAP; platformEnum=PlatformEnum.GAP;
...@@ -68,55 +70,55 @@ public class SpiderServiceImpl implements SpiderService { ...@@ -68,55 +70,55 @@ public class SpiderServiceImpl implements SpiderService {
platformEnum=PlatformEnum.NIKE; platformEnum=PlatformEnum.NIKE;
} else if(targetUrl.contains("www.afri-eshop.com") && targetUrl.contains("/products/")) { } else if(targetUrl.contains("www.afri-eshop.com") && targetUrl.contains("/products/")) {
platformEnum=PlatformEnum.AfriEshop; platformEnum=PlatformEnum.AfriEshop;
} else if (targetUrl.contains("zara")) { } else if (targetUrl.contains("zara") && Pattern.matches("^.*zara.cn.*-p\\d{8,10}.html.*$", targetUrl)) {
platformEnum = PlatformEnum.ZARA; platformEnum = PlatformEnum.ZARA;
} else if (targetUrl.contains("uniqlo") && targetUrl.contains("#/product?pid")) { } else if (targetUrl.contains("uniqlo") && Pattern.matches("^.*uniqlo.*\\/\\#\\/product\\?pid=u\\d+$", targetUrl)) {
platformEnum = PlatformEnum.UNIQLO; platformEnum = PlatformEnum.UNIQLO;
} else if (targetUrl.contains("hm.com/m") && targetUrl.contains("productpage")) { } else if (targetUrl.contains("hm") && Pattern.matches("^.*hm.com\\/.*\\/productpage.\\d{10,}.*$", targetUrl)) {
platformEnum = PlatformEnum.HM; platformEnum = PlatformEnum.HM;
} else if(targetUrl.contains("adidas.com") && targetUrl.contains("item")) { } else if(targetUrl.contains("adidas") && Pattern.matches("^.*adidas.*\\/item\\/\\w{6,10}.*$", targetUrl)) {
platformEnum=PlatformEnum.ADIDAS; platformEnum=PlatformEnum.ADIDAS;
} else if(targetUrl.contains("lily")) { } else if(targetUrl.contains("lily")) {
platformEnum=PlatformEnum.LILY; platformEnum=PlatformEnum.LILY;
} else if(targetUrl.contains("eifini")) { } else if(targetUrl.contains("eifini")) {
platformEnum=PlatformEnum.EIFINI; platformEnum=PlatformEnum.EIFINI;
} else if(targetUrl.contains("wap")) { } else if(targetUrl.contains("ur") && Pattern.matches("^.*ur.*\\/product\\/.*productColorId=\\w{24,}$", targetUrl)) {
platformEnum=PlatformEnum.URBANREVIVO; platformEnum=PlatformEnum.URBANREVIVO;
} else if(targetUrl.contains("abercrombie")) { } else if(targetUrl.contains("abercrombie") && Pattern.matches("^.*abercrombie.*\\/anf-\\d{6,}.*$", targetUrl)) {
platformEnum=PlatformEnum.ABERCROMBIEFITCH; platformEnum=PlatformEnum.ABERCROMBIEFITCH;
} else if(targetUrl.contains("ochirly.com")) { } else if(targetUrl.contains("ochirly") && Pattern.matches("^.*ochirly.*\\/p\\/.*\\w{10,}.*$", targetUrl)) {
platformEnum=PlatformEnum.OCHIRLY; platformEnum=PlatformEnum.OCHIRLY;
} else if(targetUrl.contains("esprit")) { } else if(targetUrl.contains("esprit") && Pattern.matches("^.*esprit.*\\/product\\/\\w{24,}.html.*$", targetUrl)) {
platformEnum=PlatformEnum.ESPRIT; platformEnum=PlatformEnum.ESPRIT;
} else if(targetUrl.contains("levi.com")) { } else if(targetUrl.contains("levi") && Pattern.matches("^.*levi.com.*\\/product\\/\\w{32}.html.*styleNo=.*$", targetUrl)) {
platformEnum=PlatformEnum.LEVI; platformEnum=PlatformEnum.LEVI;
} else if(targetUrl.contains("moco.com/moco/")) { } else if(targetUrl.contains("moco") && Pattern.matches("^.*moco.com\\/moco/.*\\/p\\/\\w{10,}.*$", targetUrl)) {
platformEnum=PlatformEnum.MOCO; platformEnum=PlatformEnum.MOCO;
} else if (targetUrl.contains("massimodutti")) { } else if (targetUrl.contains("massimodutti") && Pattern.matches("^.*massimodutti.cn.*\\/.*-c\\d+p\\d+.html.*$", targetUrl)) {
platformEnum = PlatformEnum.MASSIMODUTTI; platformEnum = PlatformEnum.MASSIMODUTTI;
} else if (targetUrl.contains("coach")) { } else if (targetUrl.contains("coach") && Pattern.matches("^.*coach.com\\/.*\\/.*.html\\?c=\\d+$", targetUrl)) {
platformEnum = PlatformEnum.COACH; platformEnum = PlatformEnum.COACH;
} else if (targetUrl.contains("revolve")) { } else if (targetUrl.contains("revolve") && Pattern.matches("^.*revolve.*\\/.*\\/dp\\/\\w+-\\w+\\/.*$", targetUrl)) {
platformEnum = PlatformEnum.REVOLVE; platformEnum = PlatformEnum.REVOLVE;
} else if (targetUrl.contains("vans.com")) { } else if (targetUrl.contains("vans") && Pattern.matches("^.*vans.com.*\\/product-\\d+.*$", targetUrl)) {
platformEnum = PlatformEnum.VANS; platformEnum = PlatformEnum.VANS;
} else if (targetUrl.contains("oysho")) { } else if (targetUrl.contains("oysho") && Pattern.matches("^.*oysho.*\\/.*-c\\d+p\\d+.html\\?origenId=\\d+$", targetUrl)) {
platformEnum = PlatformEnum.OYSHO; platformEnum = PlatformEnum.OYSHO;
} else if (targetUrl.contains("stradivarius")) { } else if (targetUrl.contains("stradivarius") && Pattern.matches("^.*stradivarius.*\\/.*c\\d+p\\d+.html.*$", targetUrl)) {
platformEnum = PlatformEnum.STRADIVARIUS; platformEnum = PlatformEnum.STRADIVARIUS;
} else if (targetUrl.contains("maje")) { } else if (targetUrl.contains("maje") && Pattern.matches("^.*maje.*/\\w{8,}.html.*$", targetUrl)) {
platformEnum = PlatformEnum.MAJE; platformEnum = PlatformEnum.MAJE;
} else if (targetUrl.contains("gucci")) { } else if (targetUrl.contains("gucci") && Pattern.matches("^.*gucci.*pr\\/\\w{10,}.*$", targetUrl)) {
platformEnum = PlatformEnum.GUCCI; platformEnum = PlatformEnum.GUCCI;
} else if (targetUrl.contains("burberry.com")) { } else if (targetUrl.contains("burberry") && Pattern.matches("^.*burberry.*\\/.*-p\\d{8,}$", targetUrl)) {
platformEnum = PlatformEnum.BURBERRY; platformEnum = PlatformEnum.BURBERRY;
} else if (targetUrl.contains("prada.com")) { } else if (targetUrl.contains("prada") && Pattern.matches("^.*prada.*\\/products.*$", targetUrl)) {
platformEnum = PlatformEnum.PRADA; platformEnum = PlatformEnum.PRADA;
} else if (targetUrl.contains("fendi")) { } else if (targetUrl.contains("fendi") && Pattern.matches("^.*fendi.*\\/p-\\w{10,}.*$", targetUrl)) {
platformEnum = PlatformEnum.FENDI; platformEnum = PlatformEnum.FENDI;
} else if (targetUrl.contains("apple")) { } else if (targetUrl.contains("apple")&& Pattern.matches("^.*apple.*\\/buy-.*\\/.*\\/\\w+.*$", targetUrl)) {
platformEnum = PlatformEnum.APPLE; platformEnum = PlatformEnum.APPLE;
} else if (targetUrl.contains("louisvuitton")) { } else if (targetUrl.contains("louisvuitton") && Pattern.matches("^.*fendi.*\\/p-\\w{10,}.*$", targetUrl)) {
platformEnum = PlatformEnum.LOUISVUITTON; platformEnum = PlatformEnum.LOUISVUITTON;
} }
return platformEnum; return platformEnum;
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论