Update scripts/scraper_pddikti.py
This commit is contained in:
@@ -2,12 +2,30 @@ import requests
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
def run_scraping_logic():
    """Fetch Indonesian university data from a public API and log a preview.

    ETL-style dummy task intended to run inside an Airflow worker:
      1. Extract  - GET JSON from the hipolabs universities API.
      2. Transform - load the payload into a pandas DataFrame.
      3. Load     - simulated: print the top 5 rows (name, web_pages)
                    to stdout so they appear in the Airflow task log.

    Returns:
        bool: True if all steps succeeded, False if any step raised.
    """
    print("--- MEMULAI PROSES PENARIKAN DATA DUMMY ---")

    # Public API URL (university data) — no auth required.
    url = "http://universities.hipolabs.com/search?country=Indonesia"

    try:
        # 1. Extract
        response = requests.get(url, timeout=10)
        # Fail fast on HTTP 4xx/5xx instead of trying to parse an error
        # page as JSON (the original code skipped this check).
        response.raise_for_status()
        data = response.json()

        # 2. Transform (using pandas)
        df = pd.DataFrame(data)

        # Only the top 5 rows are shown in the log to keep output short.
        preview_data = df[['name', 'web_pages']].head(5)

        # 3. Load (simulated: write to the Airflow task log)
        print("HASIL PENARIKAN DATA (5 Teratas):")
        print("====================================================")
        print(preview_data.to_string(index=False))
        print("====================================================")
        print(f"Total data yang berhasil ditarik: {len(df)} baris.")

        return True
    except Exception as e:
        # Top-level task boundary: log the failure and return False so the
        # caller (Airflow operator) can mark the task accordingly.
        print(f"TERJADI KESALAHAN: {str(e)}")
        return False
Reference in New Issue
Block a user