mirror of
https://github.com/bvanroll/yahooBot.git
synced 2025-08-29 03:52:49 +00:00
finished for today, wrapping up
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
webscraper/log
|
||||||
|
webscraper/data/
|
@@ -1 +0,0 @@
|
|||||||
[{"title": "Lindsey Graham said - dems want to play this subpoena cannon game, all bets are now off - Hillary will be investigative headlines, agree?", "category": "Politics", "description": "A full DOJ & AJ team will open it all up and will not hide, bury or turn their backs on anything.Are dems ready for this? Quid-pro-quo."}, {"title": "Why does Kamala Harris go on TV and compare ICE with the KKK when there are no similarities at all?", "category": "Politics", "description": ""}, {"title": "What do you think about Becky Lynch missing Survivor Series after possibly suffering a broken nose and concussion?", "category": "Wrestling", "description": "https://www.fightful.com/wrestling/exclu...Apparently, Nia Jax stiffed Becky causing her to be cut open during the brawl on Raw. What's your opinion on this?"}, {"title": "Is voter suppression a great idea?", "category": "Politics", "description": null}, {"title": "Now that he never won an NBA championship, is Carmelo Anthony a hall of famer or no?", "category": "Basketball", "description": "Carmelo Anthony\u2019s time with the Houston Rockets seems to be coming to a quick and humiliating end. This comes after an agonizing season with the Thunder and a productive but mostly disappointing tenure with the Knicks. It\u2019s been true for long enough now that it can be said without qualification: Carmelo Anthony............. is bad."}, {"title": "A man offered my friend money to go to Peru and come back smuggling drugs. What can I do to stop this?", "category": "Law & Ethics", "description": null}, {"title": "Conservatives. Your president screwed a porn star while the first lady was pregnant, then paid her off days before the election and lied...?", "category": "Politics", "description": null}, {"title": "What would the country be like right now if Hillary Clinton won the presidency?", "category": "Politics", "description": ""}, {"title": "Describe Donald Trump in three words?", "category": "Politics", "description": ""}, {"title": "What's wrong with gun control?", "category": "Politics", "description": ""}, {"title": "Why can't we just castrate paedophiles?", "category": "Law & Ethics", "description": "It doesn't need too be like physical scalpel and lab coats type of deal, it could be chemical castration and there could be like a 3 strikes and your out type thing. Seems pretty reasonable too me."}, {"title": "If the world is round how come we don't fall off?", "category": "Physics", "description": null}, {"title": "I'm a proud member of the Republican Party?", "category": "Politics", "description": "How about you?"}, {"title": "Why do Republicans see no evidence of collusion between Trump and Russia?", "category": "Optical", "description": ""}, {"title": "Where were you when you heard the news that John F. Kennedy had been killed?", "category": "Politics", "description": ""}, {"title": "66 confirmed dead and over 600 missing. Cons...perhaps it\u2019s time to show your support for your fellow Americans in California rather then?", "category": "Politics", "description": "Taking Trumps lead and bashing them while they\u2019re still fighting these fires and recovery bodies.... You know, be decent human beings for once"}, {"title": "How did Earth get here?", "category": "Astronomy & Space", "description": "Seriously.We have no idea where it came from. Just *poof*- it appeared. People are running around looking up to the sky...We got nothing."}, {"title": "WHY do vegans seem to think they are better than everybody else?", "category": "Vegetarian & Vegan", "description": ""}, {"title": "How often do you change the sheets on your bed?", "category": "Polls & Surveys", "description": ""}, {"title": "Why are wild fires becoming more deadly in California?", "category": "History", "description": ""}]
|
|
@@ -5,21 +5,30 @@ from bs4 import BeautifulSoup
|
|||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
import sys
|
||||||
|
from pymongo import MongoClient
|
||||||
|
from pymongo.errors import ConnectionFailure, ConfigurationError, OperationFailure
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#universal vars
|
#universal vars
|
||||||
|
|
||||||
# debug level
|
# debug level
|
||||||
frm = "%(asctime)-15s %(message)s"
|
frm = "%(asctime)-15s %(message)s"
|
||||||
|
|
||||||
lvl = logging.WARNING
|
lvl = logging.DEBUG
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#logging.basicConfig(level=logging.DEBUG)
|
#logging.basicConfig(level=logging.DEBUG)
|
||||||
#logging.basicConfig(level=lvl,format=frm,file="log")
|
logging.basicConfig(level=lvl,format=frm,filename="log")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#if you want to use mongodb, use this order of
|
||||||
|
|
||||||
def getReq(url):
|
def getReq(url):
|
||||||
try:
|
try:
|
||||||
with closing(get(url, stream=True)) as resp:
|
with closing(get(url, stream=True)) as resp:
|
||||||
@@ -38,6 +47,10 @@ def checkResponse(resp):
|
|||||||
and resp.headers['Content-Type'].lower() is not None
|
and resp.headers['Content-Type'].lower() is not None
|
||||||
and resp.headers['Content-Type'].lower().find('html') > -1)
|
and resp.headers['Content-Type'].lower().find('html') > -1)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
|
||||||
|
|
||||||
raw_html = getReq("https://answers.yahoo.com/dir/index")
|
raw_html = getReq("https://answers.yahoo.com/dir/index")
|
||||||
html = BeautifulSoup(raw_html, "html.parser")
|
html = BeautifulSoup(raw_html, "html.parser")
|
||||||
|
|
||||||
@@ -55,6 +68,38 @@ for li in html.find_all('li', class_="ya-discover-tile ya-discover-tile-qn Bfc P
|
|||||||
|
|
||||||
items.append(arr)
|
items.append(arr)
|
||||||
|
|
||||||
|
if len(sys.argv) > 1:
|
||||||
|
try:
|
||||||
|
client = MongoClient(sys.argv[1])
|
||||||
|
logging.info("connected to db")
|
||||||
|
db = client.yahooAnswers
|
||||||
|
logging.debug("selecting collection: questions")
|
||||||
|
posts = db.questions
|
||||||
|
logging.debug("selecting posts")
|
||||||
|
for i in items:
|
||||||
|
result = posts.insert_one(i)
|
||||||
|
logging.debug("inserted an item: " + str(result))
|
||||||
|
except ConnectionFailure as e:
|
||||||
|
logging.warning('Connection Problems: ' + str(e))
|
||||||
|
except ConfigurationError as e:
|
||||||
|
logging.critical('INVALID CREDENTIALS OR INVALID CONFIGURATION: ' + str(e))
|
||||||
|
except OperationFailure as e:
|
||||||
|
logging.critical('Operation Failure: ' + str(e))
|
||||||
|
else:
|
||||||
with open('data/'+time.asctime().replace(":","-").replace(" ", "_"), "w") as f:
|
with open('data/'+time.asctime().replace(":","-").replace(" ", "_"), "w") as f:
|
||||||
f.write(json.dumps(items))
|
f.write(json.dumps(items))
|
||||||
|
f.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
Reference in New Issue
Block a user