mirror of
https://github.com/bvanroll/yahooBot.git
synced 2025-08-29 12:02:49 +00:00
finished for today, wrapping up
This commit is contained in:
@@ -5,21 +5,30 @@ from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
from pymongo import MongoClient
|
||||
from pymongo.errors import ConnectionFailure, ConfigurationError, OperationFailure
|
||||
|
||||
|
||||
|
||||
|
||||
#universal vars
|
||||
|
||||
# debug level
|
||||
frm = "%(asctime)-15s %(message)s"
|
||||
|
||||
lvl = logging.WARNING
|
||||
lvl = logging.DEBUG
|
||||
|
||||
|
||||
|
||||
|
||||
#logging.basicConfig(level=logging.DEBUG)
|
||||
#logging.basicConfig(level=lvl,format=frm,file="log")
|
||||
logging.basicConfig(level=lvl,format=frm,filename="log")
|
||||
|
||||
|
||||
|
||||
#if you want to use mongodb, use this order of
|
||||
|
||||
def getReq(url):
|
||||
try:
|
||||
with closing(get(url, stream=True)) as resp:
|
||||
@@ -38,23 +47,59 @@ def checkResponse(resp):
|
||||
and resp.headers['Content-Type'].lower() is not None
|
||||
and resp.headers['Content-Type'].lower().find('html') > -1)
|
||||
|
||||
raw_html = getReq("https://answers.yahoo.com/dir/index")
|
||||
html = BeautifulSoup(raw_html, "html.parser")
|
||||
|
||||
def main():
|
||||
|
||||
|
||||
items = []
|
||||
raw_html = getReq("https://answers.yahoo.com/dir/index")
|
||||
html = BeautifulSoup(raw_html, "html.parser")
|
||||
|
||||
|
||||
for li in html.find_all('li', class_="ya-discover-tile ya-discover-tile-qn Bfc P-14 Bdbx-1g Bgc-w"):
|
||||
arr = {}
|
||||
arr['title'] = ' '.join(li.find('a', class_="title").text.split())
|
||||
arr['category'] = ' '.join(li.find("div", class_="Clr-888 Fz-12 Lh-18").find('a', class_="Clr-b").text.split())
|
||||
arr['description'] = ' '.join(li.find('div', class_="fullDesc Mah-130 Ovy-s Fz-13 Lh-18 Ol-n D-n").text.replace('\n', '').split())
|
||||
if arr['description'].find('Best answer:') > -1:
|
||||
arr['description'] = None
|
||||
|
||||
items.append(arr)
|
||||
items = []
|
||||
|
||||
|
||||
with open('data/'+time.asctime().replace(":","-").replace(" ", "_"), "w") as f:
|
||||
f.write(json.dumps(items))
|
||||
for li in html.find_all('li', class_="ya-discover-tile ya-discover-tile-qn Bfc P-14 Bdbx-1g Bgc-w"):
|
||||
arr = {}
|
||||
arr['title'] = ' '.join(li.find('a', class_="title").text.split())
|
||||
arr['category'] = ' '.join(li.find("div", class_="Clr-888 Fz-12 Lh-18").find('a', class_="Clr-b").text.split())
|
||||
arr['description'] = ' '.join(li.find('div', class_="fullDesc Mah-130 Ovy-s Fz-13 Lh-18 Ol-n D-n").text.replace('\n', '').split())
|
||||
if arr['description'].find('Best answer:') > -1:
|
||||
arr['description'] = None
|
||||
|
||||
items.append(arr)
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
try:
|
||||
client = MongoClient(sys.argv[1])
|
||||
logging.info("connected to db")
|
||||
db = client.yahooAnswers
|
||||
logging.debug("selecting collection: questions")
|
||||
posts = db.questions
|
||||
logging.debug("selecting posts")
|
||||
for i in items:
|
||||
result = posts.insert_one(i)
|
||||
logging.debug("inserted an item: " + str(result))
|
||||
except ConnectionFailure as e:
|
||||
logging.warning('Connection Problems: ' + str(e))
|
||||
except ConfigurationError as e:
|
||||
logging.critical('INVALID CREDENTIALS OR INVALID CONFIGURATION: ' + str(e))
|
||||
except OperationFailure as e:
|
||||
logging.critical('Operation Failure: ' + str(e))
|
||||
else:
|
||||
with open('data/'+time.asctime().replace(":","-").replace(" ", "_"), "w") as f:
|
||||
f.write(json.dumps(items))
|
||||
f.close()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
Reference in New Issue
Block a user