Since the Zhihu crawler was written after the Jobbole (伯乐在线) one, it builds on the earlier code; parts that are identical are not repeated here. In zhihu.py:
import scrapy
import re
from urllib import parse
from selenium import webdriver
from scrapy.http import Request
from scrapy.loader import ItemLoader
import time
import pickle
import datetime
import json
from ArticleSpider.items import ZhihuQuestionItem, ZhihuAnswerItem
try:
import urlparse as parse
except:
from urllib import parse
class ZhihuSpider(scrapy.Spider):
name = 'zhihu'
allowed_domains = ['www.zhihu.com']
start_urls = ['https://www.zhihu.com/']
# Request URL for the first page of a question's answers
start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comm"
headers = {
"HOST": "www.zhihu.com",
"Referer": 'https://www.zhihu.com',
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36"
}
custom_settings = {
"COOKIES_ENABLED": True
}
def parse(self, response):
"""
Extract every url from the html page and follow them for further crawling.
If an extracted url has the form /question/xxx, download it and hand it straight to the parsing function.
"""
all_urls = response.css("a::attr(href)").extract()
all_urls = [parse.urljoin(response.url, url) for url in all_urls]
# Filter every url with a lambda: keep it if the predicate returns True, drop it if it returns False.
all_urls = filter(lambda x:True if x.startswith("https") else False, all_urls)
for url in all_urls:
match_obj = re.match("(.*/question/(\d+))(/|$).*", url)
if match_obj:
# If a question page is matched, download it and hand it over to the question parser
request_url = match_obj.group(1)
yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
else:
# Otherwise keep following the page for further links
yield scrapy.Request(url, headers=self.headers, callback=self.parse)
def parse_question(self, response):
# Handle a question page and extract a concrete question item from it
# if "QuestionHeader-title" in response.text:
# # handle the new page layout
# match_obj = re.match("(.*/question/(\d+))(/|$).*", response.url)
# if match_obj:
# question_id = int(match_obj.group(2))
#
# item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
# item_loader.add_css("title", "h1.QuestionHeader-title::text")
# item_loader.add_css("content", ".QuestionHeader-detail span::text")
# item_loader.add_value("url", response.url)
# item_loader.add_value("zhihu_id", question_id)
# item_loader.add_css("answer_num", ".Question-mainColumn a::text")
# item_loader.add_css("comments_num", ".QuestionHeader-Comment span::text")
# item_loader.add_css("watch_user_num", ".QuestionFollowStatus-counts strong::text")
# item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
#
# question_item = item_loader.load_item()
# else:
# item extraction for the old-version page
match_obj = re.match("(.*/question/(\d+))(/|$).*", response.url)
if match_obj:
question_id = int(match_obj.group(2))
item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
# item_loader.add_css("title", ".zh-question-title h2 a::text")
# item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
# item_loader.add_css("content", "#zh-question-detail")
# item_loader.add_value("url", response.url)
# item_loader.add_value("zhihu_id", question_id)
# item_loader.add_css("answer_num", "#zh-question-answer-num::text")
# item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
# # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
# item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
# item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
item_loader.add_css("title", ".QuestionHeader-title::text")
item_loader.add_css("content", ".QuestionHeader-detail span::text")
item_loader.add_value("url", response.url)
item_loader.add_value("zhihu_id", question_id)
item_loader.add_css("answer_num", ".List-headerText span::text")
item_loader.add_css("comments_num", ".QuestionHeader-Comment span::text")
item_loader.add_css("watch_user_num", ".QuestionFollowStatus-counts strong::text")
item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
question_item = item_loader.load_item()
yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer)
yield question_item
def parse_answer(self, response):
# Process the answers of a question
ans_json = json.loads(response.text)
is_end = ans_json["paging"]["is_end"]
next_url = ans_json["paging"]["next"]
# extract the concrete fields of each answer
for answer in ans_json["data"]:
answer_item = ZhihuAnswerItem()
answer_item["zhihu_id"] = answer["id"]
answer_item["url"] = answer["url"]
answer_item["question_id"] = answer["question"]["id"]
answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
answer_item["content"] = answer["content"] if "content" in answer else None
answer_item["parise_num"] = answer["voteup_count"]
answer_item["comments_num"] = answer["comment_count"]
answer_item["create_time"] = answer["created_time"]
answer_item["update_time"] = answer["question"]["updated_time"]
answer_item["crawl_time"] = datetime.datetime.now()
yield answer_item
if not is_end:
yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)
def start_requests(self):
browser = webdriver.Chrome(executable_path="D:/")
# browser.get("https://www.zhihu.com/signup?next=%2F")
browser.get("https://www.zhihu.com/signin")
browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys("account")  # enter the real account name here
browser.find_element_by_css_selector(".SignFlow-password input").send_keys("password")  # enter the real password here
print(browser.page_source)
browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
time.sleep(10)
Cookies = browser.get_cookies()
# print(Cookies)
cookie_dict={}
for cookie in Cookies:
f=open('C:/Users/Dell/scrapytest/Scripts/ArticleSpider'+cookie['name']+'.zhihu','wb')
pickle.dump(cookie,f)
f.close()
cookie_dict[cookie['name']]=cookie['value']
browser.close()
return[scrapy.Request(url=self.start_urls[0], headers=self.headers,dont_filter=True,cookies=cookie_dict)]
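start_requests above pickles every cookie to disk, which is only useful if a later run can rebuild the session from those files instead of driving Selenium through the login again. That loading step is not shown in the post; the sketch below is one hypothetical way to do it, assuming `import os` is added to zhihu.py and that cookie_dir points at wherever the dumped *.zhihu files actually land:

def start_requests(self):
    # Hypothetical variant: rebuild the cookie dict from the pickled *.zhihu files
    # written by a previous Selenium login, skipping the browser entirely.
    cookie_dir = "C:/Users/Dell/scrapytest/Scripts"  # adjust to where the dumps really are
    cookie_dict = {}
    for file_name in os.listdir(cookie_dir):
        if file_name.endswith(".zhihu"):
            with open(os.path.join(cookie_dir, file_name), "rb") as f:
                cookie = pickle.load(f)
                cookie_dict[cookie["name"]] = cookie["value"]
    return [scrapy.Request(url=self.start_urls[0], headers=self.headers,
                           dont_filter=True, cookies=cookie_dict)]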
In main.py:
from scrapy.cmdline import execute
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# execute(["scrapy", "crawl", "jobbole"])
execute(["scrapy", "crawl", "zhihu"])
In items.py:
import re
import datetime
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from ArticleSpider.settings import SQL_DATETIME_FORMAT, SQL_DATE_FORMAT
from ArticleSpider.utils.common import extract_num  # module path is assumed here; point it at wherever extract_num is defined
from w3lib.html import remove_tags
class ArticlespiderItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
def date_convert(value):
try:
create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
except Exception as e:
create_date = datetime.datetime.now().date()
return create_date
def get_nums(value):
match_re = re.match(".*?(\d+).*", value)
if match_re:
nums = int(match_re.group(1))
else:
nums = 0
return nums
def remove_comment_tags(value):
# drop the '评论' (comment) entries that get extracted along with the tags
if "评论" in value:
return ""
else:
return value
def return_value(value):
return value
def exclude_none(value):
if value:
return value
else:
value = "无"
return value
class ZhihuQuestionItem(scrapy.Item):
# item for a Zhihu question
zhihu_id = scrapy.Field()
topics = scrapy.Field()
url = scrapy.Field()
title = scrapy.Field()
content = scrapy.Field(
input_processor=MapCompose(exclude_none),
)
answer_num = scrapy.Field()
comments_num = scrapy.Field()
watch_user_num = scrapy.Field()
click_num = scrapy.Field()
crawl_time = scrapy.Field()
def get_insert_sql(self):
# SQL statement for inserting into the zhihu_question table
insert_sql = """
insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num,
watch_user_num, click_num, crawl_time
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num),
watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)
"""
zhihu_id = self["zhihu_id"][0]
topics = ",".join(self["topics"])
url = self["url"][0]
title = "".join(self["title"])
# content = "".join(self["content"])
# answer_num = extract_num("".join(self["answer_num"]))
# # comments_num = extract_num("".join(self["comments_num"]))
try:
content = "".join(self["content"])
except BaseException:
content = "无"
try:
answer_num = extract_num("".join(self["answer_num"]))
except BaseException:
answer_num = 0
comments_num = extract_num("".join(self["comments_num"]))
if len(self["watch_user_num"]) == 2:
watch_user_num = int(self["watch_user_num"][0])
click_num = int(self["watch_user_num"][1])
else:
watch_user_num = int(self["watch_user_num"][0])
click_num = 0
crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
watch_user_num, click_num, crawl_time)
return insert_sql, params
class ZhihuAnswerItem(scrapy.Item):
zhihu_id = scrapy.Field()
url = scrapy.Field()
question_id = scrapy.Field()
author_id = scrapy.Field()
content = scrapy.Field()
parise_num = scrapy.Field()
comments_num = scrapy.Field()
create_time = scrapy.Field()
update_time = scrapy.Field()
crawl_time = scrapy.Field()
def get_insert_sql(self):
# SQL statement for inserting into the zhihu_answer table
insert_sql = """
insert into zhihu_answer(zhihu_id, url, question_id, author_id, content, parise_num, comments_num,
create_time, update_time, crawl_time
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE content=VALUES(content), comments_num=VALUES(comments_num), parise_num=VALUES(parise_num), update_time=VALUES(update_time)
"""
create_time = datetime.datetime.fromtimestamp(self["create_time"]).strftime(SQL_DATETIME_FORMAT)
update_time = datetime.datetime.fromtimestamp(self["update_time"]).strftime(SQL_DATETIME_FORMAT)
params = (
self["zhihu_id"], self["url"], self["question_id"],
self["author_id"], self["content"], self["parise_num"],
self["comments_num"], create_time, update_time,
self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
)
return insert_sql, params
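get_insert_sql above relies on extract_num, a helper carried over from the Jobbole chapter and not repeated in this post (its import line in items.py was also garbled). A minimal sketch of such a helper, assuming it lives in ArticleSpider/utils/common.py and mirrors the get_nums function shown above:

# ArticleSpider/utils/common.py (assumed location)
import re

def extract_num(text):
    # Pull the first run of digits out of a string and return it as an int; 0 if none found.
    match_re = re.match(r".*?(\d+).*", text)
    if match_re:
        return int(match_re.group(1))
    return 0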
In settings.py:
import os
BOT_NAME = 'ArticleSpider'
SPIDER_MODULES = ['ArticleSpider.spiders']
NEWSPIDER_MODULE = 'ArticleSpider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ArticleSpider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'ArticleSpider.middlewares.ArticlespiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'ArticleSpider.middlewares.ArticlespiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
# 'scrapy.pipelines.images.ImagesPipeline': 1,
# 'ArticleSpider.pipelines.ArticleImagePipeline':1,
'ArticleSpider.pipelines.JsonExporterPipeline':2,
# 'ArticleSpider.pipelines.MysqlPipeline': 4,
'ArticleSpider.pipelines.MysqlTwistedPipline': 1,
}
IMAGES_URLS_FIELD = "front_image_url"
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')
import sys
BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
sys.path.insert(0, os.path.join(BASE_DIR, 'ArticleSpider'))
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36"
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
MYSQL_HOST = "localhost"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "123456"
SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
SQL_DATE_FORMAT = "%Y-%m-%d"
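ITEM_PIPELINES above points at MysqlTwistedPipline and JsonExporterPipeline from the earlier Jobbole chapter, which this post deliberately does not repeat. For reference, a minimal sketch of a Twisted adbapi pipeline that reads the MYSQL_* settings above and calls the get_insert_sql() defined on each item; treat it as an outline of the idea rather than the post's exact code:

# ArticleSpider/pipelines.py (sketch)
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

class MysqlTwistedPipline(object):
    # Asynchronous MySQL insertion through Twisted's connection pool.
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset="utf8",
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # runInteraction runs do_insert on a pool thread so inserts do not block the crawl.
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        # Log database errors instead of silently dropping them.
        print(failure)

    def do_insert(self, cursor, item):
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)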