Update scripts/scraper_pddikti.py

This commit is contained in:
izu
2026-03-03 09:39:02 +00:00
parent 669f93825a
commit a34a70bc29

View File

@@ -2,12 +2,30 @@ import requests
import pandas as pd
def run_scraping_logic():
    """Fetch the Indonesian university list and print a preview of it.

    The function runs a small extract/transform/load sequence:

    1. Extract: GET the public Hipolabs universities endpoint, with a
       10-second timeout so a stalled connection cannot block forever.
    2. Transform: load the JSON payload into a pandas DataFrame.
    3. Load (simulated): print the first five ``name`` / ``web_pages``
       rows and the total row count to stdout.

    Returns:
        bool: ``True`` when the data was fetched and printed, ``False``
        when any step failed. Failures are printed, not raised.
    """
    print("--- MEMULAI PROSES PENARIKAN DATA DUMMY ---")

    # Public API (university data).
    url = "http://universities.hipolabs.com/search?country=Indonesia"

    try:
        # 1. Extract
        response = requests.get(url, timeout=10)
        # Turn HTTP 4xx/5xx into an explicit error here; otherwise an error
        # page would only fail later as a JSON-decode or missing-column error.
        response.raise_for_status()
        data = response.json()

        # 2. Transform (using pandas)
        df = pd.DataFrame(data)
        # Only the first five rows are shown, to keep the log short.
        preview_data = df[['name', 'web_pages']].head(5)

        # 3. Load (simulated: print the result instead of storing it)
        print("HASIL PENARIKAN DATA (5 Teratas):")
        print("====================================================")
        print(preview_data.to_string(index=False))
        print("====================================================")
        print(f"Total data yang berhasil ditarik: {len(df)} baris.")
        return True
    except Exception as e:
        # Deliberately broad: this is the entry-point boundary and the
        # contract is to report failure via the return value, not to raise.
        print(f"TERJADI KESALAHAN: {str(e)}")
        return False