import requests
from lxml import etree
import json


class Tieba(object):
    def __init__(self, name):
        self.url = 'https://tieba.baidu.com/f?kw={}'.format(name)
        # Old-browser User-Agent so Tieba returns plain HTML instead of the
        # JS-commented markup it serves to modern browsers.
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)"
        }

    def get_data(self, url):
        """Fetch a page and keep a local copy for debugging."""
        response = requests.get(url, headers=self.headers)
        with open("temp.html", "wb") as f:
            f.write(response.content)
        return response.content

    def parse_data(self, data):
        """Extract thread titles/links and the URL of the next page."""
        html = etree.HTML(data)
        el_list = html.xpath(
            '//li[@class=" j_thread_list clearfix thread_item_box"]'
            '/div/div[2]/div[1]/div[1]/a'
        )
        print(len(el_list))

        data_list = []
        for el in el_list:
            temp = {}
            temp['title'] = el.xpath("./text()")[0]
            temp['link'] = 'https://tieba.baidu.com' + el.xpath("./@href")[0]
            data_list.append(temp)

        # The "下一页>" ("next page") link is missing on the last page.
        try:
            next_url = 'https:' + html.xpath('//a[contains(text(),"下一页>")]/@href')[0]
        except IndexError:
            next_url = None
        return data_list, next_url

    def save_data(self, data_list):
        """Append each record as one JSON object per line."""
        with open("temp.json", "a", encoding='utf-8') as f:
            for data in data_list:
                data_temp = json.dumps(data, ensure_ascii=False)
                print(data_temp)
                f.write(data_temp + '\n')

    def run(self):
        next_url = self.url
        while True:
            data = self.get_data(next_url)  # follow the next-page URL, not always page one
            data_list, next_url = self.parse_data(data)
            self.save_data(data_list)
            print(next_url)
            if next_url is None:
                break


if __name__ == '__main__':
    tieba = Tieba("华东理工大学吧")
    tieba.run()
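Since save_data appends one JSON object per line to temp.json, the results can be read back line by line. Below is a minimal sketch of such a reader; the temp.json path comes from the spider above, while the function name load_threads is only for illustration.

import json


def load_threads(path="temp.json"):
    """Read the line-delimited JSON file produced by Tieba.save_data."""
    threads = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # skip any blank lines
                threads.append(json.loads(line))
    return threads


if __name__ == "__main__":
    for thread in load_threads():
        print(thread["title"], thread["link"])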