1. Scraping Buff item names and prices
import os
import random
import time
from urllib.parse import urlencode

import pandas as pd
import requests
from fake_useragent import UserAgent


class BuffCsgo:
    def __init__(self, category, save_file_path, _, price_range):
        # Looks like some kind of timestamp; left as-is for now
        self._ = _
        # Item category
        self.category = category
        # Output file path
        self.save_file_path = save_file_path
        # Price range, starting from 10
        self.price_range = price_range
        # Temporary list holding the current page's items
        self.item_datas = []
        # Fixed base URL
        self.base_url = 'https://buff.163.com/api/market/goods?'
    # Buff only reports the correct total page count for a category when a page
    # past the end is requested (like clicking the last page in the UI).
    def get_total_page(self):
        params = {
            'game': 'csgo',
            'page_num': 2000,
            'category_group': self.category,
            'min_price': 10,
            'max_price': 10 + self.price_range,
            '_': self._
        }
        url = self.base_url + urlencode(params)
        response = requests.get(url=url, headers=self.init_headers(), timeout=10)
        total_page = 0
        if response.json().get('data'):
            self._ += random.randint(1, 3)
            total_page = response.json().get('data').get('total_page')
        return total_page
    def get_page(self):
        for page in range(1, self.get_total_page() + 1):
            # for page in range(1, 4):  # for testing
            params = {
                'game': 'csgo',
                'page_num': page,
                'category_group': self.category,
                'min_price': 10,
                'max_price': 10 + self.price_range,
                '_': self._
            }
            current_url = self.base_url + urlencode(params)
            try:
                response = requests.get(url=current_url, headers=self.init_headers(),
                                        proxies=self.random_ip(), timeout=10)
                if response.status_code == 200:
                    print(f'Fetched page {page}')
                    self._ += random.randint(1, 3)
                    page_text = response.json()
                    self.parse_page(page_text)
                    self.save_to_csv()
                    time.sleep(random.random() * 5)
            except requests.ConnectionError:
                print(f'Failed to fetch page {page}')
    def parse_page(self, page_text):
        if page_text.get('data').get('items'):
            for item in page_text.get('data').get('items'):
                info = {}
                info['饰品名称'] = item.get('name')              # item name
                info['Buff当前价格'] = item.get('quick_price')   # current Buff price
                info['Buff当前在售数量'] = item.get('sell_num')  # number currently on sale on Buff
                self.item_datas.append(info)
    def init_headers(self):
        cookie = 'yourcookie'  # fill in your own Buff login cookie
        headers = {
            'User-Agent': UserAgent().random,
            'Cookie': cookie
        }
        return headers
    # Pick a random proxy
    def random_ip(self):
        proxies = [
            '120.232.150.110:80',
            '106.45.221.69:3256',
            '47.98.208.18:8080',
            '117.24.80.59:3256',
            '111.179.73.203:3256',
            '47.95.178.212:3128',
            '125.87.84.82:3256',
            '47.98.179.39:8080',
            '116.62.113.142:1081',
            '114.215.172.136:31280',
            '47.98.183.59:3128',
            '118.194.242.184:80',
            '114.67.108.243:8081',
            '120.232.150.100:80'
        ]
        # Map both schemes so the proxy is also used for the https Buff API
        choice = 'http://' + random.choice(proxies)
        proxy = {
            'http': choice,
            'https': choice
        }
        return proxy
    # Append the collected rows to a CSV file
    def save_to_csv(self):
        df = pd.DataFrame(self.item_datas)
        df = df.reindex(columns=['饰品名称', 'Buff当前价格', 'Buff当前在售数量'])
        if os.path.exists(self.save_file_path) and os.path.getsize(self.save_file_path):
            df.to_csv(self.save_file_path, mode='a', encoding='utf-8', header=False, index=False)
        else:
            df.to_csv(self.save_file_path, mode='a', encoding='utf-8', index=False)
            print('Created ' + self.save_file_path)
        self.item_datas = []
for category in ['pistol', 'rifle', 'smg', 'shotgun', 'machinegun']:  # knives, gloves and stickers are not scraped
    save_file_path = '{}.csv'.format(category)
    B = BuffCsgo(category, save_file_path, 1614323440986, 200)
    total_page = B.get_total_page()
    print(f'Category {category} currently has {total_page} pages')
    B.get_page()
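The hard-coded 1614323440986 passed in above is the `_` parameter the Buff web page appends to its API calls. Assuming it is just a millisecond timestamp used for cache busting (as the comment in `__init__` guesses), it could be generated at runtime instead of copied by hand. A minimal sketch:

import time

# Hypothetical replacement for the hard-coded value: the current time in
# milliseconds (assumption: Buff only uses `_` as a cache-busting timestamp).
def current_timestamp_ms():
    return int(time.time() * 1000)

# e.g. B = BuffCsgo(category, save_file_path, current_timestamp_ms(), 200)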
2. Scraping Steam item names and prices
import os
import random
import time
from urllib.parse import urlencode

import pandas as pd
import requests
from fake_useragent import UserAgent
from lxml import etree
from requests.adapters import HTTPAdapter


class SteamCsgo:
    def __init__(self, start, save_file_path, page_num):
        # Index of the first listing to fetch
        self.start = start
        # Number of pages to fetch
        self.page_num = page_num
        # Output file path
        self.save_file_path = save_file_path
        # Temporary list holding parsed items
        self.item_datas = []
        # Fixed base URL
        self.base_url = 'https://steamcommunity.com/market/search/render/?query=&'
    def get_page(self):
        count = 0
        # One session for all requests: retry failed connections up to 5 times
        # and close each connection so idle sockets don't pile up over a long run.
        session = requests.Session()
        session.mount('https://', HTTPAdapter(max_retries=5))
        session.headers.update({'Connection': 'close'})
        for page in range(self.page_num):
            params = {
                'start': self.start + 100 * page,
                'count': 100,
                'search_descriptions': 0,
                'sort_column': 'price',
                'sort_dir': 'asc',
                'appid': 730
            }
            current_url = self.base_url + urlencode(params)
            try:
                res = session.get(url=current_url, headers=self.init_headers(), timeout=10)
                if res.status_code == 200:
                    count = count + 1
                    if count % 10 == 0:
                        # Take a longer break every 10 pages to avoid rate limiting
                        time.sleep(60)
                    print('Successfully fetched page {}'.format(page + 1))
                    page_info = res.json()
                    self.parse_page(page_info)
                    self.save_to_csv()
                    time.sleep(random.random() * 15)
                else:
                    print('Failed to fetch page {} (status {})'.format(page + 1, res.status_code))
            except requests.ConnectionError as e:
                print(e)
                print('Failed to fetch page {}'.format(page + 1))
                return None
    def parse_page(self, page_info):
        page_html = page_info['results_html'].replace('\r', '').replace('\n', '').replace('\t', '')
        tree = etree.HTML(page_html)
        for i in tree.xpath('//a[@class="market_listing_row_link"]'):
            info = {}
            info['饰品名称'] = i.xpath('.//span[@class="market_listing_item_name"]/text()')[0]  # item name
            info['饰品价格'] = i.xpath('.//span[@class="normal_price"]/text()')[0]  # starting price
            info['当前在售数量'] = i.xpath('.//span[@class="market_listing_num_listings_qty"]/@data-qty')[0]  # number currently on sale
            self.item_datas.append(info)
    # Append the collected rows to a CSV file
    def save_to_csv(self):
        df = pd.DataFrame(self.item_datas)
        df = df.reindex(columns=['饰品名称', '饰品价格', '当前在售数量'])
        if os.path.exists(self.save_file_path) and os.path.getsize(self.save_file_path):
            df.to_csv(self.save_file_path, mode='a', encoding='utf-8', header=False, index=False)
        else:
            df.to_csv(self.save_file_path, mode='a', encoding='utf-8', index=False)
            print('Created ' + self.save_file_path)
        self.item_datas = []

    # Build headers with a random User-Agent
    def init_headers(self):
        headers = {
            'User-Agent': UserAgent().random,
            'Accept-Language': 'zh-CN',
            # 'Referer': 'https://steamcommunity.com/market/search?appid=730'
        }
        return headers
if __name__ == '__main__':
    S = SteamCsgo(6500, './CsgoSteam.csv', 60)
    S.get_page()
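The page count is hard-coded to 60 above. If the render endpoint's JSON also carries a total_count field alongside results_html (an assumption, not verified here), the number of pages could instead be derived from a single probe request. A minimal sketch with a hypothetical estimate_page_num helper:

import math
import requests
from urllib.parse import urlencode

# Hypothetical helper: probe one page and work out how many 100-item pages
# remain after `start` (assumes the response JSON contains 'total_count').
def estimate_page_num(start=0, page_size=100):
    params = {
        'start': start,
        'count': page_size,
        'search_descriptions': 0,
        'sort_column': 'price',
        'sort_dir': 'asc',
        'appid': 730
    }
    url = 'https://steamcommunity.com/market/search/render/?query=&' + urlencode(params)
    res = requests.get(url, timeout=10)
    total_count = res.json().get('total_count', 0)
    return math.ceil(max(total_count - start, 0) / page_size)

# e.g. S = SteamCsgo(6500, './CsgoSteam.csv', estimate_page_num(start=6500))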
3. Comparing the data collected by the two scripts above and printing the result
import re

import pandas as pd
import requests
from lxml import etree

df_steam = pd.read_csv('./CsgoSteam.csv')
df_buffrifle = pd.read_csv('./rifle.csv')
df_buffsmg = pd.read_csv('./smg.csv')
df_buffshotgun = pd.read_csv('./shotgun.csv')
df_buffmachinegun = pd.read_csv('./machinegun.csv')

# Concatenate the Buff data from all categories
df_buff = pd.read_csv('./pistol.csv')
df_buff = pd.concat((df_buff, df_buffrifle, df_buffshotgun, df_buffsmg, df_buffmachinegun))
df_buff = df_buff[df_buff['Buff当前在售数量'] >= 100]  # keep only items with at least 100 on sale on Buff

# Keep only Steam items with at least 100 on sale
df_steam = df_steam[df_steam['当前在售数量'] >= 100]

# Join the two datasets on the item name, then drop rows missing either side
df = pd.merge(df_steam, df_buff, how='outer')
df = df.dropna()
df = df.drop_duplicates()
# Helper to extract the numeric part of the Steam price string
def find_nums(s):
    return re.findall(r"\d+\.?\d*", str(s))[0]
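# Quick sanity check (assuming the Steam price text looks roughly like '¥ 1.23'):
#   find_nums('¥ 1.23')  -> '1.23'
#   find_nums('$10.05')  -> '10.05'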
df['饰品价格'] = df['饰品价格'].map(find_nums)
df['饰品价格'] = pd.to_numeric(df['饰品价格'], errors='coerce')
# Fetch the current USD/CNY exchange rate
def get_rate():
    url = 'https://www.huilv.cc/USD_CNY/'
    response = requests.get(url=url).text
    tree = etree.HTML(response)
    rate = float(tree.xpath('//*[@id="main"]/div[1]/div[2]/span[1]/text()')[0])
    return rate
# Compute the resale ratio, sort by it, and print the result
rate = get_rate()
df['steam当前可获得收益'] = df['饰品价格'] * rate * 0.85  # 0.85 ≈ net proceeds after Steam's ~15% transaction fee
df['倒卖比'] = df['Buff当前价格'] / df['steam当前可获得收益']
df = df.sort_values(by='倒卖比').drop_duplicates()
print(df)
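As a worked example with hypothetical numbers: for an item listed at 50 CNY on Buff and $10 on Steam with a USD/CNY rate of 6.5, the net Steam proceeds are 10 × 6.5 × 0.85 ≈ 55.25 CNY, so the resale ratio is 50 / 55.25 ≈ 0.90; smaller ratios mean the Buff price is low relative to what the item nets on Steam, so the sorted list surfaces the best candidates first.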