Commit 10e4227a authored by Ville Komulainen

Scrape Iltalehti and tests for it

parent 9bddf967
Pipeline #51093 passed
@@ -2,33 +2,6 @@ from bs4 import BeautifulSoup
import requests
import json
def scrape_yle(url):
    url = 'https://yle.fi/uutiset'  # overrides the parameter; the scheme is needed for requests.get
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find(id="ResultsContainer")
    job_elements = results.find_all("div", class_="card-content")
    #test
    #test
    python_jobs = results.find_all(
        "h2", string=lambda text: text and "python" in text.lower()
    )
    python_job_elements = [
        h2_element.parent.parent.parent for h2_element in python_jobs
    ]
    #print(python_job_elements)
    for job_element in python_job_elements:
        title_element = job_element.find("h2", class_="title is-5")
        company_element = job_element.find("h3", class_="subtitle is-6 company")
        location_element = job_element.find("p", class_="location")
        print("Title:", title_element.text.strip())
        print("Company:", company_element.text.strip())
        print("Location:", location_element.text.strip())
        print()
def scrape_xml_yle():
    url = 'https://feeds.yle.fi/uutiset/v1/recent.rss?publisherIds=YLE_UUTISET'
    artikkelit_lista = []
@@ -56,6 +29,43 @@ def scrape_xml_yle():
    print('scraped')
    return artikkelit_lista
def scrape_il():
    url = 'https://www.iltalehti.fi/rss/uutiset.xml'
    sivu = requests.get(url)
    soup = BeautifulSoup(sivu.content, features='xml')
    artikkelit = soup.find_all('item')
    artikkeli_lista = []
    for art in artikkelit:
        try:
            otsikko = art.find('title').text
            pubDate = art.find('pubDate').text
            linkki = art.find('link').text
            uusi_sivu = requests.get(linkki)
            art_soup = BeautifulSoup(uusi_sivu.content, 'html.parser')
            body = art_soup.find('div', class_='article-body')
            ps = body.find_all('p')
            teksti = ''
            for p in ps:
                teksti = teksti + p.text  # The text is spread over several <p> tags, so iterate through them and append to the string
            artikkeli = {
                'otsikko': otsikko,
                'pubDate': pubDate,
                'teksti': teksti
            }
            artikkeli_lista.append(artikkeli)
        except Exception as e:
            print("Failed to scrape an article:", e)
    print("Number of articles:", len(artikkeli_lista))
    return artikkeli_lista
def to_json(file):
    data = json.dumps(file, ensure_ascii=False, indent=1).encode('utf8')
    print("Data encoded as UTF-8 JSON; decode() turns it back into a human-readable string")
......
import unittest
from main.src.scraper import scrape_xml_yle
from main.src.scraper import scrape_il
def test_ylescrapet():
def test_ylescraper():
    artikkelit = scrape_xml_yle()
    # Make sure that articles have actually been fetched
    assert len(artikkelit) > 0
@@ -11,3 +12,13 @@ def test_ylescrapet():
    # each scraped article dict should contain exactly three fields
    for art in artikkelit:
        assert len(art) == 3
def test_ilscraper():
    artikkelit = scrape_il()
    # Make sure the Iltalehti feed yielded at least one article
    assert len(artikkelit) > 0
    assert type(artikkelit) == list
    for art in artikkelit:
        # each article should contain exactly the fields otsikko, pubDate and teksti
        assert len(art) == 3
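Both tests above run the scrapers against the live yle.fi and iltalehti.fi feeds, so they require network access and depend on the current page markup. Below is a minimal sketch, not part of this commit, of an offline variant for the Iltalehti test that stubs requests.get with unittest.mock; the fake RSS feed and article HTML are made-up assumptions, not fixtures from this repository.

# Illustrative sketch only: an offline test for scrape_il that stubs out
# requests.get so no live HTTP requests are made. The fake feed and
# article markup below are assumptions invented for this example.
from unittest import mock

FAKE_RSS = b"""<rss><channel><item>
<title>Testiotsikko</title>
<pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate>
<link>https://www.iltalehti.fi/uutiset/a/123</link>
</item></channel></rss>"""

FAKE_ARTICLE = b"""<html><body>
<div class="article-body"><p>Eka kappale.</p><p>Toka kappale.</p></div>
</body></html>"""

def _fake_get(url, *args, **kwargs):
    # Return the fake RSS feed for the .xml URL and the fake article page otherwise.
    response = mock.Mock()
    response.content = FAKE_RSS if url.endswith('.xml') else FAKE_ARTICLE
    return response

def test_ilscraper_offline():
    with mock.patch('main.src.scraper.requests.get', side_effect=_fake_get):
        artikkelit = scrape_il()
    assert artikkelit == [{
        'otsikko': 'Testiotsikko',
        'pubDate': 'Mon, 01 Jan 2024 00:00:00 GMT',
        'teksti': 'Eka kappale.Toka kappale.',
    }]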