finished for today, wrapping up

This commit is contained in:
2018-11-17 02:16:03 +01:00
parent f60fafcfe0
commit 95246f5da7
3 changed files with 63 additions and 17 deletions

View File

@@ -5,21 +5,30 @@ from bs4 import BeautifulSoup
import logging
import json
import time
import sys
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, ConfigurationError, OperationFailure
# Module-wide settings.
# Log record layout: timestamp followed by the message text.
frm = "%(asctime)-15s %(message)s"
# Logging threshold.
# NOTE(review): lvl is assigned twice in a row; as written, the later DEBUG
# value is the one that takes effect.
lvl = logging.WARNING
lvl = logging.DEBUG
#logging.basicConfig(level=logging.DEBUG)
#logging.basicConfig(level=lvl,format=frm,file="log")
# Send log records to the file named "log" (relative to the working directory).
logging.basicConfig(level=lvl,format=frm,filename="log")
# To store results in MongoDB instead of a JSON file, pass the connection URI
# as the first command-line argument (main() reads it from sys.argv[1]).
def getReq(url):
try:
with closing(get(url, stream=True)) as resp:
@@ -38,23 +47,59 @@ def checkResponse(resp):
and resp.headers['Content-Type'].lower() is not None
and resp.headers['Content-Type'].lower().find('html') > -1)
# CSS class strings that identify a question tile and its parts on the page.
_TILE_CLASS = "ya-discover-tile ya-discover-tile-qn Bfc P-14 Bdbx-1g Bgc-w"
_CATEGORY_BOX_CLASS = "Clr-888 Fz-12 Lh-18"
_DESCRIPTION_CLASS = "fullDesc Mah-130 Ovy-s Fz-13 Lh-18 Ol-n D-n"


def _parse_tile(li):
    """Extract title, category and description from one question tile.

    Runs of whitespace in every field are collapsed to single spaces.
    Newlines are stripped from the description before collapsing. The
    description is replaced by None when it contains 'Best answer:'.

    Returns a dict with the keys 'title', 'category' and 'description'.
    """
    title = ' '.join(li.find('a', class_="title").text.split())
    category_box = li.find("div", class_=_CATEGORY_BOX_CLASS)
    category = ' '.join(category_box.find('a', class_="Clr-b").text.split())
    description_box = li.find('div', class_=_DESCRIPTION_CLASS)
    description = ' '.join(description_box.text.replace('\n', '').split())
    if 'Best answer:' in description:
        description = None
    return {'title': title, 'category': category, 'description': description}


def _store_in_mongo(uri, items):
    """Insert each item into the ``questions`` collection of ``yahooAnswers``.

    Connection, configuration and operation failures raised by PyMongo are
    logged and not propagated. The client is always closed once it has been
    created.
    """
    try:
        client = MongoClient(uri)
        try:
            logging.info("connected to db")
            db = client.yahooAnswers
            logging.debug("selecting collection: questions")
            posts = db.questions
            logging.debug("selecting posts")
            for item in items:
                result = posts.insert_one(item)
                logging.debug("inserted an item: %s", result)
        finally:
            # Release the connection pool even when an insert fails.
            client.close()
    except ConnectionFailure as e:
        logging.warning('Connection Problems: %s', e)
    except ConfigurationError as e:
        logging.critical('INVALID CREDENTIALS OR INVALID CONFIGURATION: %s', e)
    except OperationFailure as e:
        logging.critical('Operation Failure: %s', e)


def _write_json(items):
    """Dump the items as JSON into a timestamp-named file under ``data/``."""
    import os  # local import: only this helper needs it

    # Create the output directory on first use instead of failing in open().
    os.makedirs('data', exist_ok=True)
    path = 'data/' + time.asctime().replace(":", "-").replace(" ", "_")
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, "w") as f:
        f.write(json.dumps(items))


def main():
    """Scrape the Yahoo Answers index page and persist the questions found.

    When a command-line argument is given it is used as the MongoDB
    connection URI and every scraped item is inserted into the database.
    Otherwise the items are written as JSON to a file under ``data/``.
    """
    url = "https://answers.yahoo.com/dir/index"
    raw_html = getReq(url)
    # NOTE(review): getReq's failure behavior is not visible from here; guard
    # in case it returns None, which BeautifulSoup cannot parse.
    if raw_html is None:
        logging.error("no HTML received from %s", url)
        return
    html = BeautifulSoup(raw_html, "html.parser")

    items = [_parse_tile(li) for li in html.find_all('li', class_=_TILE_CLASS)]

    if len(sys.argv) > 1:
        _store_in_mongo(sys.argv[1], items)
    else:
        _write_json(items)


if __name__ == "__main__":
    main()