init new dags
This commit is contained in:
11
Dockerfile
11
Dockerfile
@@ -1,13 +1,12 @@
|
||||
FROM apache/airflow:2.7.1

# Make /opt/airflow importable so DAG files can do
# `from scripts.<module> import ...` against /opt/airflow/scripts.
# NOTE(review): assumes the base image leaves PYTHONPATH unset - confirm
# before merging; a compose/Helm-level PYTHONPATH would override this value.
ENV PYTHONPATH=/opt/airflow

USER root

# Create every runtime folder in one layer and hand it to airflow:root.
# Group-writable (g+rwX) replaces the former world-writable 777: the owner and
# members of group 0 can write, everyone else is read-only.
RUN mkdir -p \
        /opt/airflow/dags \
        /opt/airflow/logs \
        /opt/airflow/plugins \
        /opt/airflow/scripts \
    && chown -R airflow:root /opt/airflow \
    && chmod -R g+rwX \
        /opt/airflow/dags \
        /opt/airflow/logs \
        /opt/airflow/plugins \
        /opt/airflow/scripts

# Everything below runs unprivileged; pip installs are done as the airflow user.
USER airflow

# Install Python dependencies before copying application code so this layer
# stays cached until requirements.txt itself changes.
COPY --chown=airflow:root requirements.txt /opt/airflow/requirements.txt
RUN pip install --no-cache-dir -r /opt/airflow/requirements.txt

# Copy the helper scripts and DAG definitions from the repository last, since
# they change most often.
COPY --chown=airflow:root ./scripts /opt/airflow/scripts
COPY --chown=airflow:root ./dags /opt/airflow/dags
|
||||
17
dags/main_pipeline.py
Normal file
17
dags/main_pipeline.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from airflow import DAG
|
||||
from airflow.operators.python import PythonOperator
|
||||
from datetime import datetime
|
||||
# Import fungsi dari folder scripts
|
||||
from scripts.scraper_pddikti import run_scraping_logic
|
||||
|
||||
# DAG with a single task that runs the scraping function imported from
# scripts.scraper_pddikti once per day; past intervals are not backfilled
# (catchup=False).
with DAG(
    dag_id="etl_akademik_separated_v1",
    start_date=datetime(2026, 3, 3),
    # `schedule` replaces `schedule_interval`, which is deprecated since
    # Airflow 2.4 (the image in this repo is pinned to 2.7.1).
    schedule="@daily",
    catchup=False,
) as dag:

    task_scraping = PythonOperator(
        task_id="run_pddikti_scraping",
        python_callable=run_scraping_logic,  # Calls the external function
    )
|
||||
0
scripts/.gitkeep
Normal file
0
scripts/.gitkeep
Normal file
13
scripts/scraper_pddikti.py
Normal file
13
scripts/scraper_pddikti.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
def run_scraping_logic():
    """Download the Indonesian university list and save it as a CSV file.

    Fetches JSON from the hipolabs universities endpoint, loads it into a
    pandas DataFrame and writes it to /opt/airflow/logs/data_akademik.csv.

    Returns:
        str: A status message containing the number of rows written.

    Raises:
        requests.exceptions.RequestException: If the request times out, the
            connection fails, or the server answers with an HTTP error status.
    """
    # Example scraping source
    url = "http://universities.hipolabs.com/search?country=Indonesia"

    # Without a timeout requests waits indefinitely, which would leave the
    # task hanging; 30 seconds bounds both connect and read.
    res = requests.get(url, timeout=30)
    # Fail the task on 4xx/5xx instead of writing an error payload to the CSV.
    res.raise_for_status()
    df = pd.DataFrame(res.json())

    # Cleaning logic or the upload to BigQuery goes here
    output_path = "/opt/airflow/logs/data_akademik.csv"
    df.to_csv(output_path, index=False)
    return f"Berhasil menarik {len(df)} data."
|
||||
Reference in New Issue
Block a user