Skip to content
Snippets Groups Projects
Commit 10e4227a authored by Ville Komulainen's avatar Ville Komulainen
Browse files

Scrape iltalehti ja testit talle

parent 9bddf967
Branches
No related tags found
No related merge requests found
Pipeline #51093 passed
...@@ -2,33 +2,6 @@ from bs4 import BeautifulSoup ...@@ -2,33 +2,6 @@ from bs4 import BeautifulSoup
import requests import requests
import json import json
def scrape_yle(url):
    """Scrape job cards from *url* and print Python-related listings.

    NOTE(review): the selectors (``ResultsContainer``, ``card-content``,
    ``title is-5`` ...) match the Real Python fake-jobs demo page, not
    yle.fi. The original hard-coded ``'yle.fi/uutiset'`` overwrite (which
    also lacked an ``https://`` scheme and would make ``requests.get``
    raise ``MissingSchema``) has been removed so the caller's *url* is
    actually honored — TODO confirm the intended target site.

    Side effects: performs an HTTP GET and prints matches to stdout.
    Returns None.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find(id="ResultsContainer")
    if results is None:
        # Wrong URL or changed page layout — nothing to scrape.
        return
    # <h2> headings whose text mentions "python". Guard against tags
    # with no string content: the original lambda crashed on text=None.
    python_jobs = results.find_all(
        "h2", string=lambda text: text is not None and "python" in text.lower()
    )
    # Climb from each matching <h2> back up to its whole job card.
    python_job_elements = [
        h2_element.parent.parent.parent for h2_element in python_jobs
    ]
    for job_element in python_job_elements:
        title_element = job_element.find("h2", class_="title is-5")
        company_element = job_element.find("h3", class_="subtitle is-6 company")
        location_element = job_element.find("p", class_="location")
        print("Title:", title_element.text.strip())
        print("Company:", company_element.text.strip())
        print("Location:", location_element.text.strip())
        print()
def scrape_xml_yle(): def scrape_xml_yle():
url = 'https://feeds.yle.fi/uutiset/v1/recent.rss?publisherIds=YLE_UUTISET' url = 'https://feeds.yle.fi/uutiset/v1/recent.rss?publisherIds=YLE_UUTISET'
artikkelit_lista = [] artikkelit_lista = []
...@@ -56,6 +29,43 @@ def scrape_xml_yle(): ...@@ -56,6 +29,43 @@ def scrape_xml_yle():
print('scrapattu') print('scrapattu')
return artikkelit_lista return artikkelit_lista
def scrape_il():
    """Scrape recent Iltalehti news articles via the public RSS feed.

    Fetches the RSS index, then each linked article page, and extracts
    the body text from the <p> tags inside the ``article-body`` div.

    Returns:
        list[dict]: one dict per successfully scraped article with keys
        ``'otsikko'`` (title), ``'pubDate'`` and ``'teksti'`` (body text).
        Articles that fail to download or parse are skipped with a
        printed message instead of aborting the whole run.
    """
    url = 'https://www.iltalehti.fi/rss/uutiset.xml'
    sivu = requests.get(url)
    soup = BeautifulSoup(sivu.content, features='xml')
    artikkelit = soup.find_all('item')
    artikkeli_lista = []
    for art in artikkelit:
        try:
            otsikko = art.find('title').text
            pubDate = art.find('pubDate').text
            linkki = art.find('link').text
            uusi_sivu = requests.get(linkki)
            # Separate name so the feed-level `soup` is not clobbered.
            art_soup = BeautifulSoup(uusi_sivu.content, 'html.parser')
            body = art_soup.find('div', class_='article-body')
            # Body text is split across several <p> tags; join them in
            # one pass instead of quadratic `+=` concatenation.
            teksti = ''.join(p.text for p in body.find_all('p'))
            artikkeli_lista.append({
                'otsikko': otsikko,
                'pubDate': pubDate,
                'teksti': teksti,
            })
        except (requests.RequestException, AttributeError) as e:
            # Was a bare `except:` that silently swallowed everything,
            # including KeyboardInterrupt. Catch only the failures this
            # loop can actually produce (network errors; `find` -> None
            # followed by attribute access) and report the cause.
            print("Jotain väärin", e)
    print("Kuinka monta artikkelia", len(artikkeli_lista))
    return artikkeli_lista
def to_json(file): def to_json(file):
data = json.dumps(file,ensure_ascii=False,indent=1).encode('utf8') data = json.dumps(file,ensure_ascii=False,indent=1).encode('utf8')
print("Data enkoodattu json-utf8 muotoon, decode() muuttaa ihmisluettavaksi") print("Data enkoodattu json-utf8 muotoon, decode() muuttaa ihmisluettavaksi")
......
import unittest import unittest
from main.src.scraper import scrape_xml_yle from main.src.scraper import scrape_xml_yle
from main.src.scraper import scrape_il
def test_ylescrapet(): def test_ylescraper():
artikkelit = scrape_xml_yle() artikkelit = scrape_xml_yle()
#Varmista,että artikkeleja on imuroitu #Varmista,että artikkeleja on imuroitu
assert len(artikkelit) > 0 assert len(artikkelit) > 0
...@@ -11,3 +12,13 @@ def test_ylescrapet(): ...@@ -11,3 +12,13 @@ def test_ylescrapet():
for art in artikkelit: for art in artikkelit:
assert len(art) == 3 assert len(art) == 3
def test_ilscraper():
    """Smoke-test scrape_il(): it must return a non-empty list of
    article dicts, each with exactly 3 keys (otsikko, pubDate, teksti)."""
    artikkelit = scrape_il()
    # At least one article was scraped from the live feed.
    assert len(artikkelit) > 0
    # isinstance() instead of `type(...) == list` — idiomatic type check.
    assert isinstance(artikkelit, list)
    for art in artikkelit:
        # len() of a dict counts its keys.
        assert len(art) == 3
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment