-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: Day-13 with IDC AI Databricks
More file actions
47 lines (36 loc) · 1.57 KB
/
Day-13 with IDC AI Databricks
File metadata and controls
47 lines (36 loc) · 1.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Day-13 with IDC Databricks AI Challenge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Candidate regressors to compare; the dict keys double as MLflow run names
# in the training loop below.
models = dict(
    linear=LinearRegression(),
    decision_tree=DecisionTreeRegressor(max_depth=5),
    random_forest=RandomForestRegressor(n_estimators=100),
)
## Loading Data from Databricks table
# `spark` is the ambient Databricks SparkSession provided by the notebook runtime.
source_table = 'samples.bakehouse.sales_transactions'
spark_df = spark.read.table(source_table)
# Pull the Spark DataFrame into pandas so scikit-learn can consume it.
df = spark_df.toPandas()
from sklearn.model_selection import train_test_split

### Features: the numeric columns; target: 'totalPrice'.
feature_cols = ['quantity', 'unitPrice']
target_col = 'totalPrice'
X = df[feature_cols]
y = df[target_col]
# Hold out 20% for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
import mlflow

# Train each candidate model, logging its parameters, R² score, and the
# fitted artifact to a separate MLflow run.
for model_name, estimator in models.items():
    with mlflow.start_run(run_name=f"{model_name}_model"):
        mlflow.log_param("model_type", model_name)
        estimator.fit(X_train, y_train)
        r2 = estimator.score(X_test, y_test)
        mlflow.log_metric("r2_score", r2)
        mlflow.sklearn.log_model(estimator, "model")
        print(f"{model_name}: R² = {r2:.4f}")
## Spark ML Pipeline: fit a linear regression natively in Spark for comparison.
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR

# Spark ML expects all features packed into one vector column.
assembler = VectorAssembler(inputCols=["quantity", "unitPrice"], outputCol="features")
lr = SparkLR(featuresCol="features", labelCol="totalPrice")
pipeline = Pipeline(stages=[assembler, lr])

# Reuse `spark_df` loaded earlier instead of re-reading the same table.
# seed=42 makes the 80/20 split reproducible, matching the random_state=42
# used for the scikit-learn split above.
train, test = spark_df.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train)