From a34a70bc294005ae9923179f52fcd97d14297023 Mon Sep 17 00:00:00 2001
From: izu
Date: Tue, 3 Mar 2026 09:39:02 +0000
Subject: [PATCH] Update scripts/scraper_pddikti.py

---
 scripts/scraper_pddikti.py | 46 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 8 deletions(-)

diff --git a/scripts/scraper_pddikti.py b/scripts/scraper_pddikti.py
index 14bbdb7..f5b5d68 100644
--- a/scripts/scraper_pddikti.py
+++ b/scripts/scraper_pddikti.py
@@ -2,12 +2,42 @@ import requests
 import pandas as pd
 
 def run_scraping_logic():
-    # Contoh scraping
-    url = "http://universities.hipolabs.com/search?country=Indonesia"
-    res = requests.get(url)
-    df = pd.DataFrame(res.json())
+    """Pull Indonesian university data from a public API and log a preview.
+
+    Prints the first five rows (``name`` and ``web_pages`` columns) and the
+    total row count to stdout.
+
+    Returns:
+        bool: True on success; False if the request, the JSON decoding, or
+        the column selection fails (the error is printed, not raised).
+    """
+    print("--- MEMULAI PROSES PENARIKAN DATA DUMMY ---")
 
-    # Logika cleaning atau upload ke BigQuery ditaruh di sini
-    output_path = "/opt/airflow/logs/data_akademik.csv"
-    df.to_csv(output_path, index=False)
-    return f"Berhasil menarik {len(df)} data."
\ No newline at end of file
+    # Public API URL (university data)
+    url = "http://universities.hipolabs.com/search?country=Indonesia"
+
+    try:
+        # 1. Extract
+        response = requests.get(url, timeout=10)
+        # Treat HTTP 4xx/5xx as a failure instead of parsing an error body as data
+        response.raise_for_status()
+        data = response.json()
+
+        # 2. Transform (using pandas)
+        df = pd.DataFrame(data)
+
+        # Only the first 5 rows are shown in the log
+        preview_data = df[['name', 'web_pages']].head(5)
+
+        # 3. Load (simulated: print to the Airflow log)
+        print("HASIL PENARIKAN DATA (5 Teratas):")
+        print("====================================================")
+        print(preview_data.to_string(index=False))
+        print("====================================================")
+        print(f"Total data yang berhasil ditarik: {len(df)} baris.")
+
+        return True
+    except (requests.RequestException, ValueError, KeyError) as e:
+        # Network/HTTP failure, undecodable JSON, or missing expected columns
+        print(f"TERJADI KESALAHAN: {str(e)}")
+        return False