赛题1
任务A:数据采集与处理
子任务一:数据采集
爬取酒店列表数据,例如酒店名称、地址、商圈、评分、评论数、类型,并且存入到hotel.csv文件中
import numpy as np
import pandas as pd
import requests
from pandas import Series, DataFrame
# Raw request headers copied from the browser's network panel, one
# "Name: value" pair per line.
# NOTE(review): this embeds a captured session cookie (JSESSIONID etc.);
# it will presumably expire and is better kept out of source control.
headers_str = '''accept: application/json, text/plain, */*
Accept-Encoding: gzip, deflate, br
accept-language: zh-cn
appfrom: 16
cluster: idc
Connection: keep-alive
Cookie: soso_17u_tab_open_index=1; H5CookieId=2a31035b-81b0-452f-8af5-4bc2cb6d6229; firsttime=1693790207090; abtkey=037eed29-b152-4fc3-92a8-027faa2dffc6; _tcudid_v2=GVAglBHWXx1wFDh4_P4gvwe79p_DO7Vf2Yj7sQ9vCd4; nus=userid=682242716&nickName=%e5%90%8c%e7%a8%8b%e4%bc%9a%e5%91%98_0D043E7512A&level=1; __tctma=144323752.1693790169808694.1693790169437.1693790169437.1693793067529.2; hotel_lang=zh-cn; 17uCNRefId=RefId=0&SEFrom=&SEKeyWords=; TicketSEInfo=RefId=0&SEFrom=&SEKeyWords=; CNSEInfo=RefId=0&tcbdkeyid=&SEFrom=&SEKeyWords=&RefUrl=; qdid=-9999; businessLine=hotel; H5Channel=mnoreferseo%2CSEO; indate=2023-09-09; outdate=2023-09-10; route=e83eaebd8f07fc1b8cfab528aeb2900e; lasttime=1694239332948; JSESSIONID=A23D8BD953AE131A1233B6F406759B2E
deviceid: 2a31035b-81b0-452f-8af5-4bc2cb6d6229
Host: www.ly.com
Referer: https://www.ly.com/hotel/hotellist?pageSize=20&t=1694239332807&city=321&inDate=2023-09-09&outDate=2023-09-10&filterList=8888_1
sec-ch-ua-mobile: ?0
sec-ch-ua-platform: "Windows"
Sec-Fetch-Dest: empty
Sec-Fetch-Mode: cors
Sec-Fetch-Site: same-origin
timezone: 8
tmapi-client: tpc
traceid: d6d36af2-7f3a-4504-a125-db1cfa470c97
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'''

# Turn the raw text into the {name: value} mapping that requests expects.
headers_dict = {}
line_list = headers_str.split('\n')
for line in line_list:
    # Split on the FIRST colon only: values such as the Referer URL contain
    # further colons, and those must stay part of the value.
    key, sep, value = line.partition(':')
    if not sep:
        # Not a "Name: value" line (e.g. a blank line); nothing to record.
        continue
    headers_dict[key.strip()] = value.strip()
# Crawl the hotel list JSON API page by page and save the collected columns
# to hotel_new.csv.

# Per-column buffers; each receives one entry per hotel returned by the API.
hotel_name_list = []
address_list = []
shangquan_list = []
rate_list = []
comment_conut_list = []
leixing_list = []
commentMainTag_list = []

# API response field -> buffer that collects it.
field_buffers = (
    ('hotelName', hotel_name_list),
    ('hotelAddress', address_list),
    ('areaName', shangquan_list),
    ('commentScore', rate_list),
    ('commentCount', comment_conut_list),
    ('starLevelDes', leixing_list),
    ('commentMainTag', commentMainTag_list),
)

for i in range(100):
    url = 'https://www.ly.com/tapi/v2/list?pageSize=20&t=1694239311116&city=321&inDate=2023-09-09&outDate=2023-09-10&filterList=8888_1&pageIndex=' + str(i) + '&sugActInfo='
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, headers=headers_dict, timeout=10)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    res.raise_for_status()
    res_json = res.json()
    # 'data' or 'hotelList' may be absent or null; treat that as "no hotels".
    hotel_list = (res_json.get('data') or {}).get('hotelList') or []
    if not hotel_list:
        # Presumably an empty page means the results are exhausted, so the
        # remaining page indexes are not requested.
        break
    for hotel in hotel_list:
        for field, buffer in field_buffers:
            # .get keeps every buffer the same length even when one hotel
            # lacks a field; the gap becomes an empty cell in the CSV.
            buffer.append(hotel.get(field))

# Assemble the buffers into a table (column names are part of the CSV schema).
hotel_dict = {}
hotel_dict['hotel_name'] = hotel_name_list
hotel_dict['address'] = address_list
hotel_dict['shangquan'] = shangquan_list
hotel_dict['rate'] = rate_list
hotel_dict['comment_conut'] = comment_conut_list
hotel_dict['leixing'] = leixing_list
hotel_dict['commentMainTag'] = commentMainTag_list
df1 = DataFrame(data=hotel_dict)
df1.to_csv('hotel_new.csv')
from selenium import webdriver # 从selenium库中调用webdriver模块
from selenium.webdriver.common.by import By
import time,re
# Crawl the rendered hotel list pages with a real browser, extract the
# fields with regular expressions, and save them to hotel.csv.
driver = webdriver.Chrome()
driver.get('https://www.ly.com/hotel/hotellist?pageSize=20&t=1693792521684&city=321&inDate=2023-09-04&outDate=2023-09-05&filterList=8888_1')  # open the list page

hotel_name_list = []
address_list = []
shangquan_list = []
rate_list = []
comment_conut_list = []
leixing_list = []

# Compile each pattern once instead of on every page.
# NOTE(review): the data-v-883905b8 attribute looks build-specific, so these
# patterns may stop matching after a site redeploy - confirm before reuse.
name_pattern = re.compile('class="name">(.*?)</span>')
address_pattern = re.compile('<p data-v-883905b8="" class="position"><span data-v-883905b8="">(.*?)<!----></span><!---->')
shangquan_pattern = re.compile('margin-left: 5px;">(.*?)</span>')
rate_pattern = re.compile('<p data-v-883905b8="" class="score mb5"><em data-v-883905b8="">(.*?)</em>')
comment_pattern = re.compile('<p data-v-883905b8="" class="comment mb10">(.*?)</p>')
leixing_pattern = re.compile('<em data-v-883905b8="" class="starLevelStr">(.*?)</em>')

page_count = 3
try:
    for i in range(page_count):
        time.sleep(6)  # give the browser time to render the current page
        # Take ONE snapshot per page so every field is extracted from the
        # same HTML; repeated page_source reads can differ mid-render.
        page_source = driver.page_source
        hotel_name_list += name_pattern.findall(page_source)
        address_list += address_pattern.findall(page_source)
        shangquan_list += shangquan_pattern.findall(page_source)
        rate_list += rate_pattern.findall(page_source)
        comment_conut_list += comment_pattern.findall(page_source)
        leixing_list += leixing_pattern.findall(page_source)
        # Parse first, then advance: clicking before parsing skipped page 1
        # and read the next page before it had loaded.
        if i < page_count - 1:
            driver.find_element(By.LINK_TEXT, '下一页').click()
finally:
    # Always release the browser process, even if scraping failed.
    driver.quit()

# Print each column with its length; the DataFrame below requires all
# columns to have the same number of entries.
print(hotel_name_list)
print(len(hotel_name_list))
print(address_list)
print(len(address_list))
print(shangquan_list)
print(len(shangquan_list))
print(rate_list)
print(len(rate_list))
print(comment_conut_list)
print(len(comment_conut_list))
print(leixing_list)
print(len(leixing_list))

hotel_dict = {}
hotel_dict['hotel_name'] = hotel_name_list
hotel_dict['address'] = address_list
hotel_dict['shangquan'] = shangquan_list
hotel_dict['rate'] = rate_list
hotel_dict['comment_conut'] = comment_conut_list
hotel_dict['leixing'] = leixing_list
df1 = DataFrame(data=hotel_dict)
df1.to_csv('hotel.csv')
子任务二:数据处理
先读取文件。由于采集到的数据中没有缺失值,这里手动制造几个缺失值,便于演示后续的处理步骤。
# Load the scraped hotel table and blank out two cells so that the
# missing-value handling below has something to work on.
file = pd.read_csv('hotel.csv')
# Use a single .loc assignment: chained indexing (file[col][row] = ...) may
# write to a temporary copy and is a no-op under pandas Copy-on-Write.
file.loc[0, 'hotel_name'] = np.nan
file.loc[4, 'rate'] = np.nan
print(file)
- 删除hotel.csv中酒店名称为空的数据并且存入hotel2_c1.csv;打印出被删除的数据。
# Split the rows on whether the hotel name is present: rows with a name are
# saved to hotel2_c1.csv, rows without one are the "deleted" records.
name_missing = file['hotel_name'].isnull()
df1 = file[~name_missing]
# index=False keeps pandas from writing another index column;
# mode='w' overwrites any existing file.
df1.to_csv('hotel2_c1.csv', index=False, mode='w')
# Show the records that were removed.
print(file[name_missing])
- 删除hotel2.csv中缺失值大于3个字段的数据记录并且存入hotel2_c2.csv;
# Remove every record that has more than 3 missing fields and save the
# remainder to hotel2_c2.csv.
# Count the empty cells of each row once and reuse the count for both the
# kept and the removed subsets.
missing_per_row = file.isnull().sum(axis=1)
df2 = file[missing_per_row <= 3]
df2.to_csv('hotel2_c2.csv', index=False, mode='w')
# Show the records that were removed.
deleted_data = file[missing_per_row > 3]
print(deleted_data)
- 将hotel2.csv中评分为空的数据设置为0并且存入hotel2_c3.csv;
# Replace missing ratings with 0 and save the result to hotel2_c3.csv.
# Work on a real copy so the original table stays untouched (plain
# assignment would only create a second reference to the same object).
df3 = file.copy()
# Assign the filled column back. Calling fillna(..., inplace=True) on
# df3['rate'] acts on an intermediate Series and does not update df3 under
# pandas Copy-on-Write.
df3['rate'] = df3['rate'].fillna(0)
df3.to_csv('hotel2_c3.csv', index=False, mode='w')
# Show the records whose rating was changed.
deleted_data = file[file['rate'].isnull()]
print(deleted_data)
- 将hotel2.csv中评分为空的数据设置为平均评分并且存入hotel2_c4.csv。
# Replace missing ratings with the average rating and save the result to
# hotel2_c4.csv.
# Series.mean() ignores missing values, so this is the mean of the known
# ratings.
rate_mean = file['rate'].mean()
# Work on a real copy so the original table stays untouched.
df3 = file.copy()
# Assign the filled column back. Calling fillna(..., inplace=True) on
# df3['rate'] acts on an intermediate Series and does not update df3 under
# pandas Copy-on-Write.
df3['rate'] = df3['rate'].fillna(rate_mean)
df3.to_csv('hotel2_c4.csv', index=False, mode='w')
# The records whose rating was changed.
deleted_data = file[file['rate'].isnull()]
任务B:数据分析与可视化
子任务一:数据分析
城市游客接纳能力是城市规划建设中的重要指标,其中城市的酒店房间数量是城市游客接纳能力的关键要素。请编写程序或脚本根据酒店管理网站中的数据hotel.csv统计以下的相关信息,具体要求如下:
1、 酒店一共有几种档次,分别统计其酒店总数;统计各种档次酒店的平均评分和总的评论数。(如果数据中有缺失则填充为0)
# Reload the hotel table and inject a few missing values for the analysis.
file = pd.read_csv('hotel.csv')
# Single .loc assignments: chained indexing (file[col][row] = ...) may write
# to a temporary copy and is a no-op under pandas Copy-on-Write.
file.loc[0, 'hotel_name'] = np.nan
file.loc[3] = np.nan  # blank out the whole row labelled 3
file.loc[4, 'rate'] = np.nan
# Row labels belonging to each hotel tier.
print(file.groupby('leixing').groups)
# Number of hotels in each tier. size() counts rows; sum() would instead add
# up (or concatenate) the values of the other columns.
print(file.groupby('leixing').size())
2、 统计各种档次酒店的平均评分
# Average rating of each hotel tier, presented as a one-column table.
rate_mean = file.groupby('leixing')['rate'].mean().to_frame()
print(rate_mean)
3、 统计各种档次酒店的总的评论数
# Total number of comments for each hotel tier.
# Missing entries become a "0 comments" string so they parse like the rest.
file['comment_conut'] = file['comment_conut'].fillna('共0条')
# Pull the number out of the comment-count text.
pattern = r'共(.*?)条'
extracted = file['comment_conut'].str.extract(pattern, expand=False)
# Text that does not match the pattern (or whose capture is not a number)
# becomes NaN; count it as 0 instead of letting astype(int) raise.
file['comment_conut'] = pd.to_numeric(extracted, errors='coerce').fillna(0).astype(int)
# Sum per tier. The variable keeps its historical name although it holds a
# sum, not a mean.
comment_count_mean = DataFrame(file.groupby('leixing')['comment_conut'].sum())
print(comment_count_mean)
子任务二:数据可视化
在企业消费平台上,各地区的酒店信息能够反映一个地区商业活动的密集程度。例如,酒店总量多的城市大都具有较强的吸纳外来人员的能力,订单数量多则反映该地区有较多的商业往来。根据现有数据及给定参数完成酒店数据统计。
使用Python代码编写数据可视化的相关功能,数据分析业务所用数据为hotel.csv数据,具体要求如下:
1、 用柱状图显示各类档次的酒店总数;
# Bar chart of the number of hotels in each tier.
# matplotlib is imported here because nothing earlier in the visible file
# imports it.
import matplotlib
import matplotlib.pyplot as plt

# Reload the hotel table and inject a few missing values.
file = pd.read_csv('hotel.csv')
# Single .loc assignments: chained indexing (file[col][row] = ...) may write
# to a temporary copy and is a no-op under pandas Copy-on-Write.
file.loc[0, 'hotel_name'] = np.nan
file.loc[3] = np.nan  # blank out the whole row labelled 3
file.loc[4, 'rate'] = np.nan
# Use the SimHei font so the Chinese tier labels render.
matplotlib.rcParams['font.sans-serif'] = 'SimHei'
# Row labels belonging to each hotel tier.
print(file.groupby('leixing').groups)
# Hotels per tier. size() counts rows, whereas sum()['rate'] charted the sum
# of the ratings rather than the number of hotels.
hotel_counts = file.groupby('leixing').size()
print(hotel_counts)
# The x axis is the Series index (tier), the y axis its values (count).
hotel_counts.plot(kind='bar')
# Display the figure.
plt.show()
2、 用折线图显示各档次酒店平均评分走势;
# Line chart of the average rating of each hotel tier.
# Use the SimHei font so the Chinese tier labels render.
matplotlib.rcParams['font.sans-serif'] = 'SimHei'
tier_groups = file.groupby('leixing')
# Row labels belonging to each hotel tier.
print(tier_groups.groups)
mean_rate_by_tier = tier_groups['rate'].mean()
print(mean_rate_by_tier)
# The x axis is the Series index (tier), the y axis its values (mean rating).
mean_rate_by_tier.plot()
# Display the figure.
plt.show()
3、 用饼图显示各档次酒店数占比。
# Pie chart of each tier's share of all hotels.
# Use the SimHei font so the Chinese tier labels render.
matplotlib.rcParams['font.sans-serif'] = 'SimHei'
# Number of hotels per tier; the pie slices are proportional to these counts.
count = file['leixing'].value_counts()
print(count)
# Slice labels come from the Series index, slice sizes from its values.
count.plot.pie()
# Display the figure.
plt.show()