Files
yahooBot/webscraper/main.py

106 lines
2.7 KiB
Python

from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import logging
import json
import time
import sys
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, ConfigurationError, OperationFailure
#universal vars
# debug level
frm = "%(asctime)-15s %(message)s"
lvl = logging.DEBUG
#logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=lvl,format=frm,filename="log")
#if you want to use mongodb, use this order of
def getReq(url):
try:
with closing(get(url, stream=True)) as resp:
if checkResponse(resp):
return resp.content
else:
return None
except RequestException as e:
logging.warning('error during requests to {} : {}'.format(url, str(e)))
return None
def checkResponse(resp):
return (resp.status_code == 200
and resp.headers['Content-Type'].lower() is not None
and resp.headers['Content-Type'].lower().find('html') > -1)
def main():
raw_html = getReq("https://answers.yahoo.com/dir/index")
html = BeautifulSoup(raw_html, "html.parser")
items = []
for li in html.find_all('li', class_="ya-discover-tile ya-discover-tile-qn Bfc P-14 Bdbx-1g Bgc-w"):
arr = {}
arr['title'] = ' '.join(li.find('a', class_="title").text.split())
arr['category'] = ' '.join(li.find("div", class_="Clr-888 Fz-12 Lh-18").find('a', class_="Clr-b").text.split())
arr['description'] = ' '.join(li.find('div', class_="fullDesc Mah-130 Ovy-s Fz-13 Lh-18 Ol-n D-n").text.replace('\n', '').split())
if arr['description'].find('Best answer:') > -1:
arr['description'] = None
items.append(arr)
if len(sys.argv) > 1:
try:
client = MongoClient(sys.argv[1])
logging.info("connected to db")
db = client.yahooAnswers
logging.debug("selecting collection: questions")
posts = db.questions
logging.debug("selecting posts")
for i in items:
result = posts.insert_one(i)
logging.debug("inserted an item: " + str(result))
except ConnectionFailure as e:
logging.warning('Connection Problems: ' + str(e))
except ConfigurationError as e:
logging.critical('INVALID CREDENTIALS OR INVALID CONFIGURATION: ' + str(e))
except OperationFailure as e:
logging.critical('Operation Failure: ' + str(e))
else:
with open('data/'+time.asctime().replace(":","-").replace(" ", "_"), "w") as f:
f.write(json.dumps(items))
f.close()
if __name__ == "__main__":
main()