Update scripts/scraper_pddikti.py
This commit is contained in:
@@ -2,12 +2,30 @@ import requests
|
||||
import pandas as pd
|
||||
|
||||
def run_scraping_logic():
    """Fetch the Indonesian university list and print a short preview.

    Sends one GET request to the public Hipolabs universities API, loads
    the JSON payload into a pandas DataFrame, and prints the ``name`` and
    ``web_pages`` columns of the first five rows followed by the total
    number of rows retrieved.

    Returns:
        bool: ``True`` when the data was fetched and printed, ``False``
        when any step raised. The error is printed, not re-raised, so the
        caller has to inspect the return value to detect a failure.
    """
    print("--- MEMULAI PROSES PENARIKAN DATA DUMMY ---")

    # Public API endpoint (university data).
    url = "http://universities.hipolabs.com/search?country=Indonesia"

    try:
        # 1. Extract: the timeout keeps an unresponsive host from
        # blocking the call indefinitely.
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of handing an error
        # payload to pandas as if it were data.
        response.raise_for_status()
        data = response.json()

        # 2. Transform (with pandas).
        df = pd.DataFrame(data)

        # Only the first five rows are shown, to keep the log short.
        preview_data = df[['name', 'web_pages']].head(5)

        # 3. Load (simulated): print the preview to stdout.
        # NOTE(review): presumably stdout ends up in the Airflow task log,
        # as the original comment suggests; confirm against the DAG setup.
        print("HASIL PENARIKAN DATA (5 Teratas):")
        print("====================================================")
        print(preview_data.to_string(index=False))
        print("====================================================")
        print(f"Total data yang berhasil ditarik: {len(df)} baris.")

        return True
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and signal
        # failure through the return value rather than propagating.
        print(f"TERJADI KESALAHAN: {str(e)}")
        return False
|
||||
Reference in New Issue
Block a user