diff --git a/main/src/scraper.py b/main/src/scraper.py
index 45813e37ab4a30fc88ef979542335467c6be3151..12436c51dc8cf0cbe4de04504f18e67e7fcbf442 100644
--- a/main/src/scraper.py
+++ b/main/src/scraper.py
@@ -2,33 +2,6 @@
 from bs4 import BeautifulSoup
 import requests
 import json
-def scrape_yle(url):
-
-    url = 'yle.fi/uutiset'
-    page = requests.get(url)
-
-    soup = BeautifulSoup(page.content, "html.parser")
-    results = soup.find(id="ResultsContainer")
-    job_elements = results.find_all("div", class_="card-content")
-    #test
-    #test
-    python_jobs = results.find_all(
-        "h2", string=lambda text: "python" in text.lower()
-    )
-
-    python_job_elements = [
-        h2_element.parent.parent.parent for h2_element in python_jobs
-    ]
-    #print(python_job_elements)
-    for job_element in python_job_elements:
-        title_element = job_element.find("h2", class_="title is-5")
-        company_element = job_element.find("h3", class_="subtitle is-6 company")
-        location_element = job_element.find("p", class_="location")
-        print("Title:",title_element.text.strip())
-        print("Company:",company_element.text.strip())
-        print("Location:",location_element.text.strip())
-        print()
-
 def scrape_xml_yle():
     url = 'https://feeds.yle.fi/uutiset/v1/recent.rss?publisherIds=YLE_UUTISET'
     artikkelit_lista = []
@@ -56,6 +29,43 @@
     print('scrapattu')
     return artikkelit_lista
 
+def scrape_il():
+    url = 'https://www.iltalehti.fi/rss/uutiset.xml'
+    sivu = requests.get(url)
+    soup = BeautifulSoup(sivu.content,features='xml')
+
+    artikkelit = soup.find_all('item')
+
+
+
+    artikkeli_lista = []
+    for art in artikkelit:
+        try:
+            otsikko = art.find('title').text
+            pubDate = art.find('pubDate').text
+            linkki = art.find('link').text
+
+
+            uusi_sivu = requests.get(linkki)
+            soup = BeautifulSoup(uusi_sivu.content,'html.parser')
+            body = soup.find('div', class_='article-body')
+
+            ps = body.find_all('p')
+
+            teksti = ''
+            for p in ps:
+                teksti = teksti + p.text #Teksti on useissa <p> tageissa, joten ne iteroidaan läpi ja lisätään tekstiin
+            artikkeli = {
+                'otsikko': otsikko,
+                'pubDate': pubDate,
+                'teksti': teksti
+            }
+            artikkeli_lista.append(artikkeli)
+        except:
+            print("Jotain väärin")
+    print("Kuinka monta artikkelia",len(artikkeli_lista))
+    return artikkeli_lista
+
 def to_json(file):
     data = json.dumps(file,ensure_ascii=False,indent=1).encode('utf8')
     print("Data enkoodattu json-utf8 muotoon, decode() muuttaa ihmisluettavaksi")
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
index 0922cbabaf7f12fa32ea61deed57c044c7ebc2fb..cc903c387937a214000422991031d1c975a74028 100644
--- a/tests/test_scraper.py
+++ b/tests/test_scraper.py
@@ -1,7 +1,8 @@
 import unittest
 from main.src.scraper import scrape_xml_yle
+from main.src.scraper import scrape_il
 
-def test_ylescrapet():
+def test_ylescraper():
     artikkelit = scrape_xml_yle()
     #Varmista,että artikkeleja on imuroitu
     assert len(artikkelit) > 0
@@ -11,3 +12,13 @@
     for art in artikkelit:
         assert len(art) == 3
 
+def test_ilscraper():
+    artikkelit = scrape_il()
+
+    assert len(artikkelit) > 0
+
+    assert type(artikkelit) == list
+
+    for art in artikkelit:
+        assert len(art) == 3
+