init new dags
This commit is contained in:
11
Dockerfile
11
Dockerfile
@@ -1,13 +1,12 @@
|
|||||||
FROM apache/airflow:2.7.1

USER root

# Create the scripts folder and give it to the airflow user up front.
# Ownership is scoped to the new directory only — no blanket
# `chmod -R 777` (least privilege; hadolint/security guidance).
RUN mkdir -p /opt/airflow/scripts \
    && chown -R airflow:root /opt/airflow/scripts

# Copy project code from the build context into the container.
# --chown avoids a follow-up `RUN chown` layer (which would double
# the layer size).
COPY --chown=airflow:root ./scripts /opt/airflow/scripts
COPY --chown=airflow:root ./dags /opt/airflow/dags

# Drop privileges before installing Python dependencies and at runtime.
USER airflow

# Copy the manifest separately so the dependency layer is cached until
# requirements.txt changes; --no-cache-dir keeps the image small.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
17
dags/main_pipeline.py
Normal file
17
dags/main_pipeline.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime

# Task logic lives in the scripts folder copied into the image
# (assumes /opt/airflow is importable by the scheduler — TODO confirm).
from scripts.scraper_pddikti import run_scraping_logic

# Daily ETL pipeline: a single task that runs the PDDIKTI scraping
# function defined outside the DAG file (separation of orchestration
# from task logic).
with DAG(
    dag_id="etl_akademik_separated_v1",
    start_date=datetime(2026, 3, 3),
    # `schedule` replaces `schedule_interval`, deprecated since
    # Airflow 2.4 (the base image is 2.7.1, so `schedule` is available).
    schedule="@daily",
    # Do not backfill runs between start_date and the current date.
    catchup=False,
) as dag:

    task_scraping = PythonOperator(
        task_id="run_pddikti_scraping",
        # Calls the external function; keeps the DAG file thin.
        python_callable=run_scraping_logic,
    )
0
scripts/.gitkeep
Normal file
0
scripts/.gitkeep
Normal file
13
scripts/scraper_pddikti.py
Normal file
13
scripts/scraper_pddikti.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
import requests
import pandas as pd


def run_scraping_logic(
    url="http://universities.hipolabs.com/search?country=Indonesia",
    output_path="/opt/airflow/logs/data_akademik.csv",
    timeout=30,
):
    """Fetch university records as JSON and persist them to a CSV file.

    Parameters default to the original hard-coded values, so the existing
    Airflow caller (`python_callable=run_scraping_logic`) keeps working
    unchanged while other endpoints/paths become possible.

    Args:
        url: Endpoint expected to return a JSON array of records.
        output_path: Destination CSV path inside the container.
        timeout: Per-request timeout in seconds; without one a stalled
            connection would hang the Airflow worker indefinitely.

    Returns:
        A status message string with the number of rows fetched.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If the request exceeds `timeout` seconds.
    """
    # Fail fast on non-2xx responses instead of silently building a
    # DataFrame from an error payload and writing it to disk.
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()

    df = pd.DataFrame(res.json())

    # Cleaning logic or the BigQuery upload goes here.
    df.to_csv(output_path, index=False)

    return f"Berhasil menarik {len(df)} data."
Reference in New Issue
Block a user