From f60fafcfe0f72c1b9cdd327eb78dd1dadb79b30a Mon Sep 17 00:00:00 2001 From: beppe vanrolleghem Date: Sat, 17 Nov 2018 01:04:36 +0100 Subject: [PATCH] webscraper is almost finished --- readme.md | 3 ++ webscraper/data/Sat_Nov_17_01-01-53_2018 | 1 + webscraper/log | 0 webscraper/main.py | 60 ++++++++++++++++++++++++ 4 files changed, 64 insertions(+) create mode 100644 webscraper/data/Sat_Nov_17_01-01-53_2018 create mode 100644 webscraper/log create mode 100644 webscraper/main.py diff --git a/readme.md b/readme.md index 360cf79..69e48ef 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,8 @@ # yahoo answers bot +!!PLEASE USE PYTHON 3 WHEN CONTRIBUTING TO THIS PROJECT!! + + ## part 1, webscraper yahoo answers api has stopped being supported so there has to be a dedicated webscraper. diff --git a/webscraper/data/Sat_Nov_17_01-01-53_2018 b/webscraper/data/Sat_Nov_17_01-01-53_2018 new file mode 100644 index 0000000..82cc154 --- /dev/null +++ b/webscraper/data/Sat_Nov_17_01-01-53_2018 @@ -0,0 +1 @@ +[{"title": "Lindsey Graham said - dems want to play this subpoena cannon game, all bets are now off - Hillary will be investigative headlines, agree?", "category": "Politics", "description": "A full DOJ & AJ team will open it all up and will not hide, bury or turn their backs on anything.Are dems ready for this? Quid-pro-quo."}, {"title": "Why does Kamala Harris go on TV and compare ICE with the KKK when there are no similarities at all?", "category": "Politics", "description": ""}, {"title": "What do you think about Becky Lynch missing Survivor Series after possibly suffering a broken nose and concussion?", "category": "Wrestling", "description": "https://www.fightful.com/wrestling/exclu...Apparently, Nia Jax stiffed Becky causing her to be cut open during the brawl on Raw. 
What's your opinion on this?"}, {"title": "Is voter suppression a great idea?", "category": "Politics", "description": null}, {"title": "Now that he never won an NBA championship, is Carmelo Anthony a hall of famer or no?", "category": "Basketball", "description": "Carmelo Anthony\u2019s time with the Houston Rockets seems to be coming to a quick and humiliating end. This comes after an agonizing season with the Thunder and a productive but mostly disappointing tenure with the Knicks. It\u2019s been true for long enough now that it can be said without qualification: Carmelo Anthony............. is bad."}, {"title": "A man offered my friend money to go to Peru and come back smuggling drugs. What can I do to stop this?", "category": "Law & Ethics", "description": null}, {"title": "Conservatives. Your president screwed a porn star while the first lady was pregnant, then paid her off days before the election and lied...?", "category": "Politics", "description": null}, {"title": "What would the country be like right now if Hillary Clinton won the presidency?", "category": "Politics", "description": ""}, {"title": "Describe Donald Trump in three words?", "category": "Politics", "description": ""}, {"title": "What's wrong with gun control?", "category": "Politics", "description": ""}, {"title": "Why can't we just castrate paedophiles?", "category": "Law & Ethics", "description": "It doesn't need too be like physical scalpel and lab coats type of deal, it could be chemical castration and there could be like a 3 strikes and your out type thing. 
Seems pretty reasonable too me."}, {"title": "If the world is round how come we don't fall off?", "category": "Physics", "description": null}, {"title": "I'm a proud member of the Republican Party?", "category": "Politics", "description": "How about you?"}, {"title": "Why do Republicans see no evidence of collusion between Trump and Russia?", "category": "Optical", "description": ""}, {"title": "Where were you when you heard the news that John F. Kennedy had been killed?", "category": "Politics", "description": ""}, {"title": "66 confirmed dead and over 600 missing. Cons...perhaps it\u2019s time to show your support for your fellow Americans in California rather then?", "category": "Politics", "description": "Taking Trumps lead and bashing them while they\u2019re still fighting these fires and recovery bodies.... You know, be decent human beings for once"}, {"title": "How did Earth get here?", "category": "Astronomy & Space", "description": "Seriously.We have no idea where it came from. Just *poof*- it appeared. 
"""Scrape the question tiles from the Yahoo Answers discover page.

Run as a script, this module downloads https://answers.yahoo.com/dir/index,
extracts the title, category and description of every question tile and
writes them as a JSON list to a timestamped file inside ``data/``.
"""
from contextlib import closing
import json
import logging
import os
import time

from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException

#universal vars

# debug level
# Format and level for an explicit logging setup. They are not applied
# automatically; to send records to the ``log`` file call
# logging.basicConfig(level=lvl, format=frm, filename="log").
frm = "%(asctime)-15s %(message)s"

lvl = logging.WARNING

# Exact class attribute values used to locate elements on the page.
TILE_CLASS = "ya-discover-tile ya-discover-tile-qn Bfc P-14 Bdbx-1g Bgc-w"
CATEGORY_BOX_CLASS = "Clr-888 Fz-12 Lh-18"
DESCRIPTION_CLASS = "fullDesc Mah-130 Ovy-s Fz-13 Lh-18 Ol-n D-n"


def getReq(url, timeout=10):
    """Download *url* and return its body when the response is HTML.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The raw response body, or None when the request fails or the
        response is not a 200 answer with an HTML content type.
    """
    try:
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if checkResponse(resp):
                return resp.content
            return None
    except RequestException as e:
        logging.warning('error during requests to %s : %s', url, e)
        return None


def checkResponse(resp):
    """Return True when *resp* is a 200 response carrying HTML.

    A missing Content-Type header is treated as "not HTML" instead of
    raising KeyError.
    """
    content_type = resp.headers.get('Content-Type', '').lower()
    return resp.status_code == 200 and 'html' in content_type


def _cleanText(tag):
    """Return the text of *tag* with whitespace runs collapsed.

    Returns None when *tag* is None, i.e. the element was not found.
    """
    if tag is None:
        return None
    # split() already treats newlines as separators; removing them first
    # would glue together words that were only separated by a line break.
    return ' '.join(tag.text.split())


def parseQuestions(html):
    """Extract the question tiles from a parsed discover page.

    Args:
        html: BeautifulSoup document of the discover page.

    Returns:
        A list of dicts with the keys 'title', 'category' and
        'description'. Tiles without a title are skipped. 'category' and
        'description' are None when their element is missing, and
        'description' is also None when its text contains 'Best answer:'.
    """
    items = []
    for li in html.find_all('li', class_=TILE_CLASS):
        title = _cleanText(li.find('a', class_="title"))
        if title is None:
            logging.warning('skipping question tile without a title')
            continue

        category_box = li.find("div", class_=CATEGORY_BOX_CLASS)
        category = None
        if category_box is not None:
            category = _cleanText(category_box.find('a', class_="Clr-b"))

        description = _cleanText(li.find('div', class_=DESCRIPTION_CLASS))
        if description is not None and 'Best answer:' in description:
            description = None

        items.append({
            'title': title,
            'category': category,
            'description': description,
        })
    return items


def saveItems(items, directory='data'):
    """Write *items* as JSON to a timestamped file and return its path.

    The file name is time.asctime() with ':' replaced by '-' and spaces
    replaced by '_'. *directory* is created when it does not exist yet.
    """
    os.makedirs(directory, exist_ok=True)
    name = time.asctime().replace(":", "-").replace(" ", "_")
    path = os.path.join(directory, name)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(items, f)
    return path


def main():
    """Fetch the discover page, parse it and store the result.

    Returns:
        0 on success, 1 when the page could not be downloaded.
    """
    raw_html = getReq("https://answers.yahoo.com/dir/index")
    if raw_html is None:
        # getReq signals failure with None; handing that to BeautifulSoup
        # would only raise an unhelpful TypeError.
        logging.error('could not download the discover page, nothing saved')
        return 1

    html = BeautifulSoup(raw_html, "html.parser")
    saveItems(parseQuestions(html))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())