Skip to content
项目
群组
代码片段
帮助
正在加载...
帮助
为 GitLab 提交贡献
登录/注册
切换导航
Z
zion
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
分枝图
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
计划
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
分枝图
统计图
创建新议题
作业
提交
议题看板
打开侧边栏
zhengfg
zion
Commits
43eddb1b
提交
43eddb1b
authored
10月 07, 2019
作者:
梁业锦
💬
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
添加了pullandbear、uniqlo、zara的数据爬虫,待完善数据格式化
上级
7bbbceb8
隐藏空白字符变更
内嵌
并排
正在显示
6 个修改的文件
包含
257 行增加
和
7 行删除
+257
-7
PullandbearSpider.java
...m/diaoyun/zion/chinafrica/bis/impl/PullandbearSpider.java
+69
-0
UniqloSpider.java
...va/com/diaoyun/zion/chinafrica/bis/impl/UniqloSpider.java
+54
-0
ZaraSpider.java
...java/com/diaoyun/zion/chinafrica/bis/impl/ZaraSpider.java
+106
-0
PlatformEnum.java
.../java/com/diaoyun/zion/chinafrica/enums/PlatformEnum.java
+3
-0
ItemSpiderFactory.java
...om/diaoyun/zion/chinafrica/factory/ItemSpiderFactory.java
+12
-0
SpiderServiceImpl.java
...aoyun/zion/chinafrica/service/impl/SpiderServiceImpl.java
+13
-7
没有找到文件。
src/main/java/com/diaoyun/zion/chinafrica/bis/impl/PullandbearSpider.java
0 → 100644
浏览文件 @
43eddb1b
package
com
.
diaoyun
.
zion
.
chinafrica
.
bis
.
impl
;
import
com.diaoyun.zion.chinafrica.bis.IItemSpider
;
import
com.diaoyun.zion.chinafrica.enums.PlatformEnum
;
import
com.diaoyun.zion.master.util.HttpClientUtil
;
import
net.sf.json.JSONArray
;
import
net.sf.json.JSONObject
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.springframework.stereotype.Component
;
import
java.io.IOException
;
import
java.net.URISyntaxException
;
import
java.util.concurrent.ExecutionException
;
import
java.util.concurrent.TimeoutException
;
/**
 * Item spider for Pull&amp;Bear (Spanish youth fashion brand).
 *
 * <p>Extracts the product id from a product page URL, downloads the product
 * detail JSON and reduces it to the product id, name and color list.
 *
 * <p>TODO: image paths are not processed yet (the image path format has not
 * been recorded).
 */
@Component("pullandbearSpider")
public class PullandbearSpider implements IItemSpider {

    private static final Logger logger = LoggerFactory.getLogger(PullandbearSpider.class);

    /** Prefix of the Pull&amp;Bear product detail endpoint; the product id is appended to it. */
    private static final String pullandbearUrl =
            "https://www.pullandbear.cn/itxrest/2/catalog/store/24009528/20309423/category/0/product/";

    /** Suffix that terminates the product id inside a product page URL. */
    private static final String PAGE_SUFFIX = ".html";

    /**
     * Fetches the product detail JSON for the given product page URL and
     * returns its simplified form.
     *
     * @param targetUrl product page URL of the form {@code ...p<id>.html[?query]}
     * @return JSON object with the keys {@code id}, {@code name} and {@code data} (the color array)
     * @throws IllegalArgumentException if no product id can be extracted from {@code targetUrl}
     */
    @Override
    public JSONObject captureItem(String targetUrl)
            throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
        // Extract the product id from the page URL.
        String pId = extractProductId(targetUrl);
        // Build the detail endpoint URL that serves the product data as JSON.
        String detailUrl = pullandbearUrl + pId + "/detail?languageId=-7&appId=1";
        // Download the content.
        String content = HttpClientUtil.getContentByUrl(detailUrl, PlatformEnum.PULLANDBEAR.getValue());
        // Parse it and keep only the fields we need.
        return getMainData(JSONObject.fromObject(content));
    }

    /**
     * Returns the text between the last {@code 'p'} that precedes the
     * {@code ".html"} suffix and that suffix.
     *
     * <p>The search for {@code 'p'} starts at the suffix and goes backwards, so
     * a {@code 'p'} inside a trailing query string cannot push the start index
     * past the end index.
     *
     * @param targetUrl product page URL
     * @return the non-empty product id
     * @throws IllegalArgumentException if the URL is {@code null} or does not contain {@code p<id>.html}
     */
    private static String extractProductId(String targetUrl) {
        if (targetUrl == null) {
            throw new IllegalArgumentException("Pull&Bear product URL must not be null");
        }
        int suffixIndex = targetUrl.lastIndexOf(PAGE_SUFFIX);
        // lastIndexOf(ch, fromIndex) yields -1 for a negative fromIndex, so a missing suffix is covered too.
        int markerIndex = targetUrl.lastIndexOf('p', suffixIndex - 1);
        if (suffixIndex < 0 || markerIndex < 0 || markerIndex + 1 >= suffixIndex) {
            throw new IllegalArgumentException(
                    "Cannot extract product id from Pull&Bear URL: " + targetUrl);
        }
        return targetUrl.substring(markerIndex + 1, suffixIndex);
    }

    /**
     * Reduces the detail response to the fields that are returned to callers.
     *
     * @param json parsed detail response; its first {@code bundleProductSummaries} entry is used
     * @return JSON object holding the product {@code id}, {@code name} and the color array under {@code data}
     */
    private JSONObject getMainData(JSONObject json) {
        JSONObject summary = json.getJSONArray("bundleProductSummaries").getJSONObject(0);
        // Product id
        String productId = summary.getString("id");
        // Product name
        String productName = summary.getString("name");
        JSONArray colors = summary.getJSONObject("detail").getJSONArray("colors");

        // Result payload
        JSONObject returnJson = new JSONObject();
        returnJson.put("id", productId);
        returnJson.put("name", productName);
        returnJson.put("data", colors);
        return returnJson;
    }
}
src/main/java/com/diaoyun/zion/chinafrica/bis/impl/UniqloSpider.java
0 → 100644
浏览文件 @
43eddb1b
package
com
.
diaoyun
.
zion
.
chinafrica
.
bis
.
impl
;
import
com.diaoyun.zion.chinafrica.bis.IItemSpider
;
import
com.diaoyun.zion.chinafrica.enums.PlatformEnum
;
import
com.diaoyun.zion.master.util.HttpClientUtil
;
import
net.sf.json.JSONArray
;
import
net.sf.json.JSONObject
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.springframework.stereotype.Component
;
import
java.io.IOException
;
import
java.net.URISyntaxException
;
import
java.util.concurrent.ExecutionException
;
import
java.util.concurrent.TimeoutException
;
/**
 * Item spider for Uniqlo.
 *
 * <p>Reads the {@code productCode} query parameter from a product page URL,
 * downloads the matching product JSON and returns name, price and rows.
 *
 * <p>TODO: image paths are not processed yet. The image path is:
 * "https://www.uniqlo.cn/hmall/test/" + product id + "/sku/40/" + product image id + ".jpg"
 */
@Component("uniqloSpider")
public class UniqloSpider implements IItemSpider {

    private static final Logger logger = LoggerFactory.getLogger(UniqloSpider.class);

    /** Prefix of the Uniqlo product data endpoint. */
    private static final String uniqloUrl = "https://www.uniqlo.cn/data/products/spu/";

    /** Query key that introduces the product code in a product page URL. */
    private static final String PRODUCT_CODE_KEY = "productCode=";

    /**
     * Fetches the product JSON for the given product page URL and returns its
     * simplified form.
     *
     * @param targetUrl product page URL containing a {@code productCode=<code>} parameter
     * @return JSON object with the keys {@code name}, {@code price} and {@code data} (the {@code rows} array)
     * @throws IllegalArgumentException if {@code targetUrl} carries no product code
     */
    @Override
    public JSONObject captureItem(String targetUrl)
            throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
        // Product code taken from the page URL.
        String pId = extractProductCode(targetUrl);
        // URL of the JSON document that describes the product.
        String detailUrl = uniqloUrl + "zh_CN/" + pId + ".json";
        // Download and parse the content.
        String content = HttpClientUtil.getContentByUrl(detailUrl, PlatformEnum.UNIQLO.getValue());
        JSONObject json = JSONObject.fromObject(content);

        JSONObject summary = json.getJSONObject("summary");
        // Product name
        String pName = summary.getString("name");
        // Product price
        String pPrice = summary.getString("originPrice");
        // Rows of the response, passed through unchanged.
        JSONArray rowsJson = json.getJSONArray("rows");

        JSONObject returnJson = new JSONObject();
        returnJson.put("name", pName);
        returnJson.put("price", pPrice);
        returnJson.elementOpt("data", rowsJson);
        return returnJson;
    }

    /**
     * Returns the value of the first {@code productCode} parameter in the URL.
     *
     * <p>The value ends at the next {@code '&'} or {@code '#'} so that further
     * query parameters or a fragment do not leak into the product code.
     *
     * @param targetUrl product page URL
     * @return the non-empty product code
     * @throws IllegalArgumentException if the URL is {@code null} or has no non-empty {@code productCode} value
     */
    private static String extractProductCode(String targetUrl) {
        if (targetUrl == null) {
            throw new IllegalArgumentException("Uniqlo product URL must not be null");
        }
        int keyIndex = targetUrl.indexOf(PRODUCT_CODE_KEY);
        if (keyIndex < 0) {
            throw new IllegalArgumentException("Missing productCode parameter in Uniqlo URL: " + targetUrl);
        }
        int start = keyIndex + PRODUCT_CODE_KEY.length();
        int end = targetUrl.length();
        for (char delimiter : new char[] {'&', '#'}) {
            int delimiterIndex = targetUrl.indexOf(delimiter, start);
            if (delimiterIndex >= 0 && delimiterIndex < end) {
                end = delimiterIndex;
            }
        }
        String code = targetUrl.substring(start, end);
        if (code.isEmpty()) {
            throw new IllegalArgumentException("Empty productCode parameter in Uniqlo URL: " + targetUrl);
        }
        return code;
    }
}
src/main/java/com/diaoyun/zion/chinafrica/bis/impl/ZaraSpider.java
0 → 100644
浏览文件 @
43eddb1b
package
com
.
diaoyun
.
zion
.
chinafrica
.
bis
.
impl
;
import
com.diaoyun.zion.chinafrica.bis.IItemSpider
;
import
com.diaoyun.zion.chinafrica.enums.PlatformEnum
;
import
com.diaoyun.zion.master.util.HttpClientUtil
;
import
net.sf.json.JSONArray
;
import
net.sf.json.JSONObject
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.springframework.stereotype.Component
;
import
java.io.IOException
;
import
java.net.URISyntaxException
;
import
java.util.HashMap
;
import
java.util.Map
;
import
java.util.concurrent.ExecutionException
;
import
java.util.concurrent.TimeoutException
;
/**
 * Item spider for Zara (Spanish fashion brand).
 *
 * <p>Downloads a product page and parses the JSON assigned to
 * {@code dataLayer} inside it.
 *
 * <p>TODO: the data is not fully processed yet.
 */
@Component("zaraSpider")
public class ZaraSpider implements IItemSpider {

    private static final Logger logger = LoggerFactory.getLogger(ZaraSpider.class);

    /** Zara site base URL; not referenced by the code in this class. */
    private static final String zaraUrl = "https://www.zara.cn/cn/zh/";

    /** Text at which the embedded product JSON assignment starts. */
    private static final String DATA_START_MARKER = "dataLayer";

    /** Text that follows the embedded product JSON (not part of it). */
    private static final String DATA_END_MARKER = ";window.zara.viewPayload";

    /**
     * Downloads the product page and returns the embedded {@code dataLayer} JSON.
     *
     * @param targetUrl product page URL
     * @return the parsed {@code dataLayer} object
     * @throws IOException if the page cannot be fetched or does not contain the expected markers
     */
    @Override
    public JSONObject captureItem(String targetUrl)
            throws URISyntaxException, IOException, ExecutionException, InterruptedException, TimeoutException {
        // Download the page content.
        String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.ZARA.getValue());
        // Cut out the JSON text and parse it.
        return JSONObject.fromObject(extractDataLayerJson(content));
    }

    /**
     * Cuts the {@code dataLayer} JSON text out of the page content.
     *
     * @param content page content
     * @return the text between the markers with every {@code "dataLayer = "} occurrence removed
     * @throws IOException if the markers are missing or out of order
     */
    private static String extractDataLayerJson(String content) throws IOException {
        return getDataJson(content, DATA_START_MARKER, DATA_END_MARKER).replace("dataLayer = ", "");
    }

    /**
     * Returns the part of {@code jsonStr} from the first occurrence of
     * {@code startStr} up to the last occurrence of {@code lastStr}.
     *
     * @param jsonStr  text to cut from
     * @param startStr start marker (included in the result)
     * @param lastStr  end marker (excluded from the result)
     * @return the text between the two markers
     * @throws IOException if {@code jsonStr} is {@code null}, a marker is missing,
     *                     or the end marker lies before the start marker
     */
    private static String getDataJson(String jsonStr, String startStr, String lastStr) throws IOException {
        if (jsonStr == null) {
            throw new IOException("No page content to extract '" + startStr + "' from");
        }
        int startIndex = jsonStr.indexOf(startStr);
        int lastIndex = jsonStr.lastIndexOf(lastStr);
        // Fail with a readable message instead of a bare StringIndexOutOfBoundsException.
        if (startIndex < 0 || lastIndex < startIndex) {
            throw new IOException("Page content does not contain '" + startStr
                    + "' followed by '" + lastStr + "'");
        }
        return jsonStr.substring(startIndex, lastIndex);
    }

    /**
     * Manual check: fetches a sample product page and prints the extracted
     * JSON text, the color nodes and the size nodes.
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        // Sample product URL
        String targetUrl = "https://www.zara.cn/cn/zh/%E5%BA%9C%E7%BB%B8%E9%95%BF%E7%89%88%E8%A1%AC%E8%A1%AB-p08053157.html?v1=31979171&v2=1319321";
        // Download the page content.
        String content = HttpClientUtil.getContentByUrl(targetUrl, PlatformEnum.ZARA.getValue());
        // Cut out the main product data.
        String dataLayerJson = extractDataLayerJson(content);
        System.err.println(dataLayerJson);
        // Parse and walk product -> detail -> colors.
        JSONObject json = JSONObject.fromObject(dataLayerJson);
        JSONObject details = json.getJSONObject("product").getJSONObject("detail");
        JSONArray colorsArray = details.getJSONArray("colors");

        Map<Integer, JSONObject> colorMap = new HashMap<>();
        for (int i = 0; i < colorsArray.size(); i++) {
            colorMap.put(i, colorsArray.getJSONObject(i));
        }
        System.out.println(colorMap);
        // TODO extract the color attributes

        // Collect the size nodes of every color. The key is a running counter:
        // keying by the per-color index would let each color overwrite the previous one.
        Map<Integer, JSONObject> sizesMap = new HashMap<>();
        for (int colorIndex = 0; colorIndex < colorsArray.size(); colorIndex++) {
            JSONArray sizesArray = colorsArray.getJSONObject(colorIndex).getJSONArray("sizes");
            for (int i = 0; i < sizesArray.size(); i++) {
                sizesMap.put(sizesMap.size(), sizesArray.getJSONObject(i));
            }
        }
        System.out.println(sizesMap);
        // TODO extract the price and size attributes
        // TODO extract the image attributes (image entity class not known yet)
    }
}
src/main/java/com/diaoyun/zion/chinafrica/enums/PlatformEnum.java
浏览文件 @
43eddb1b
...
...
@@ -12,7 +12,10 @@ public enum PlatformEnum implements EnumItemable<PlatformEnum> {
TB
(
"淘宝"
,
"tb"
),
TM
(
"天猫"
,
"tm"
),
PULLANDBEAR
(
"Pullandbear"
,
"pullandbear"
),
GAP
(
"GAP"
,
"gap"
),
ZARA
(
"Zara"
,
"zara"
),
UNIQLO
(
"优衣库"
,
"uniqlo"
),
NIKE
(
"NIKE"
,
"nike"
),
UN
(
"未知"
,
"un"
);
...
...
src/main/java/com/diaoyun/zion/chinafrica/factory/ItemSpiderFactory.java
浏览文件 @
43eddb1b
...
...
@@ -20,10 +20,22 @@ public class ItemSpiderFactory {
iItemSpider
=
(
IItemSpider
)
SpringContextUtil
.
getBean
(
"tmItemSpider"
);
break
;
}
case
"pullandbear"
:{
iItemSpider
=
(
IItemSpider
)
SpringContextUtil
.
getBean
(
"pullandbearSpider"
);
break
;
}
case
"gap"
:{
iItemSpider
=
(
IItemSpider
)
SpringContextUtil
.
getBean
(
"gapItemSpider"
);
break
;
}
case
"zara"
:{
iItemSpider
=
(
IItemSpider
)
SpringContextUtil
.
getBean
(
"zaraSpider"
);
break
;
}
case
"uniqlo"
:{
iItemSpider
=
(
IItemSpider
)
SpringContextUtil
.
getBean
(
"uniqloSpider"
);
break
;
}
case
"nike"
:{
iItemSpider
=
(
IItemSpider
)
SpringContextUtil
.
getBean
(
"nikeItemSpider"
);
break
;
...
...
src/main/java/com/diaoyun/zion/chinafrica/service/impl/SpiderServiceImpl.java
浏览文件 @
43eddb1b
...
...
@@ -43,14 +43,20 @@ public class SpiderServiceImpl implements SpiderService {
private
PlatformEnum
judgeUrlType
(
String
targetUrl
)
{
PlatformEnum
platformEnum
=
PlatformEnum
.
UN
;
if
(
targetUrl
.
contains
(
"taobao.com"
)&&(
targetUrl
.
contains
(
"item.htm"
)||
targetUrl
.
contains
(
"detail.htm"
)))
{
platformEnum
=
PlatformEnum
.
TB
;
}
else
if
(
targetUrl
.
contains
(
"tmall.com/item.htm"
))
{
if
(
targetUrl
.
contains
(
"taobao.com"
)&&(
targetUrl
.
contains
(
"item.htm"
)||
targetUrl
.
contains
(
"detail.htm"
)))
{
platformEnum
=
PlatformEnum
.
TB
;
}
else
if
(
targetUrl
.
contains
(
"tmall.com/item.htm"
))
{
platformEnum
=
PlatformEnum
.
TM
;
}
else
if
(
targetUrl
.
contains
(
"www.gap.cn/pdp/"
))
{
platformEnum
=
PlatformEnum
.
GAP
;
}
else
if
(
targetUrl
.
contains
(
"www.nike.com/cn/t/"
))
{
platformEnum
=
PlatformEnum
.
NIKE
;
}
else
if
(
targetUrl
.
contains
(
"https://www.pullandbear.cn"
))
{
platformEnum
=
PlatformEnum
.
PULLANDBEAR
;
}
else
if
(
targetUrl
.
contains
(
"www.gap.cn/pdp/"
))
{
platformEnum
=
PlatformEnum
.
GAP
;
}
else
if
(
targetUrl
.
contains
(
"zara.cn"
))
{
platformEnum
=
platformEnum
.
ZARA
;
}
else
if
(
targetUrl
.
contains
(
"uniqlo.cn/product-detail.html"
))
{
platformEnum
=
platformEnum
.
UNIQLO
;
}
else
if
(
targetUrl
.
contains
(
"www.nike.com/cn/t/"
))
{
platformEnum
=
platformEnum
.
NIKE
;
}
return
platformEnum
;
}
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论