1+ {
2+ "nbformat" : 4 ,
3+ "nbformat_minor" : 0 ,
4+ "metadata" : {
5+ "colab" : {
6+ "provenance" : []
7+ },
8+ "kernelspec" : {
"name": "python3",
"display_name": "Python 3"
11+ },
12+ "language_info" : {
"name": "python"
14+ }
15+ },
16+ "cells" : [
17+ {
"cell_type": "code",
"source": [
    "# Imports: numpy/pandas for data handling, scikit-learn for the\n",
    "# train/test split, and Keras for the LSTM spam classifier.\n",
    "# (CountVectorizer was imported but never used anywhere in this\n",
    "# notebook -- the Keras Tokenizer does the vectorizing -- so it is removed.)\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences"
],
29+ "metadata" : {
30+ "id" : " q9n8UiMR74n3"
31+ },
32+ "execution_count" : 12 ,
33+ "outputs" : []
34+ },
35+ {
"cell_type": "code",
"source": [
    "# Load the labelled email corpus and binarize the target:\n",
    "# 'ham' -> 0, 'spam' -> 1.\n",
    "data = pd.read_csv('spam_ham_dataset.csv')\n",
    "X = data['text']\n",
    "y = data['label'].map({'ham': 0, 'spam': 1})"
],
43+ "metadata" : {
44+ "id" : " mzWr2hfq773m"
45+ },
46+ "execution_count" : 13 ,
47+ "outputs" : []
48+ },
49+ {
"cell_type": "code",
"source": [
    "# Hold out 20% for final evaluation. Spam datasets are class-imbalanced,\n",
    "# so stratify on y to keep the ham/spam ratio identical in both splits.\n",
    "X_tr, X_te, y_tr, y_te = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42, stratify=y\n",
    ")"
],
54+ "metadata" : {
55+ "id" : " DUbwo5FV7-LB"
56+ },
57+ "execution_count" : 14 ,
58+ "outputs" : []
59+ },
60+ {
"cell_type": "code",
"source": [
    "# Fit the tokenizer on the training texts only (no vocabulary leakage\n",
    "# from the test set), then map each email to a sequence of word indices.\n",
    "max_seq_len = 100\n",
    "\n",
    "tokenizer = Tokenizer()\n",
    "tokenizer.fit_on_texts(X_tr)\n",
    "\n",
    "X_tr_seq = tokenizer.texts_to_sequences(X_tr)\n",
    "X_te_seq = tokenizer.texts_to_sequences(X_te)\n",
    "\n",
    "# Pad (or truncate) at the end so every input is exactly max_seq_len long.\n",
    "X_tr_pad = pad_sequences(X_tr_seq, maxlen=max_seq_len, padding='post')\n",
    "X_te_pad = pad_sequences(X_te_seq, maxlen=max_seq_len, padding='post')"
],
71+ "metadata" : {
72+ "id" : " Em0mEiBR8BPb"
73+ },
74+ "execution_count" : 15 ,
75+ "outputs" : []
76+ },
77+ {
"cell_type": "code",
"source": [
    "# Two-layer LSTM classifier over learned 128-d word embeddings.\n",
    "# +1 because Keras word indices start at 1; index 0 is the padding token.\n",
    "vocab_size = len(tokenizer.word_index) + 1\n",
    "model = Sequential()\n",
    "model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_len))\n",
    "# First LSTM returns the full sequence so the second LSTM can consume it.\n",
    "model.add(LSTM(64, return_sequences=True))\n",
    "model.add(Dropout(0.5))\n",
    "model.add(LSTM(64))\n",
    "# Single sigmoid unit -> P(spam), paired with binary cross-entropy below.\n",
    "model.add(Dense(1, activation='sigmoid'))\n",
    "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "# NOTE(review): in the recorded run val_loss bottoms out at epoch 4 (0.1177)\n",
    "# and climbs afterwards while training accuracy keeps rising -- consider an\n",
    "# EarlyStopping callback on val_loss instead of a fixed 10 epochs.\n",
    "model.fit(X_tr_pad, y_tr, epochs=10, batch_size=32, validation_split=0.2)"
],
90+ "metadata" : {
91+ "colab" : {
92+ "base_uri" : " https://localhost:8080/"
93+ },
94+ "id" : " s_M5u0tM8Fkb" ,
95+ "outputId" : " 93d7c7f8-40b7-44be-b425-8f57dcc18190"
96+ },
97+ "execution_count" : 16 ,
98+ "outputs" : [
99+ {
100+ "output_type" : " stream" ,
101+ "name" : " stdout" ,
102+ "text" : [
103+ " Epoch 1/10\n " ,
104+ " 104/104 [==============================] - 35s 297ms/step - loss: 0.5534 - accuracy: 0.7482 - val_loss: 0.5267 - val_accuracy: 0.7705\n " ,
105+ " Epoch 2/10\n " ,
106+ " 104/104 [==============================] - 28s 272ms/step - loss: 0.5402 - accuracy: 0.7582 - val_loss: 0.5110 - val_accuracy: 0.7742\n " ,
107+ " Epoch 3/10\n " ,
108+ " 104/104 [==============================] - 29s 277ms/step - loss: 0.4195 - accuracy: 0.8210 - val_loss: 0.2600 - val_accuracy: 0.9118\n " ,
109+ " Epoch 4/10\n " ,
110+ " 104/104 [==============================] - 28s 271ms/step - loss: 0.2672 - accuracy: 0.8951 - val_loss: 0.1177 - val_accuracy: 0.9614\n " ,
111+ " Epoch 5/10\n " ,
112+ " 104/104 [==============================] - 28s 268ms/step - loss: 0.0643 - accuracy: 0.9797 - val_loss: 0.1393 - val_accuracy: 0.9650\n " ,
113+ " Epoch 6/10\n " ,
114+ " 104/104 [==============================] - 28s 268ms/step - loss: 0.0444 - accuracy: 0.9879 - val_loss: 0.1399 - val_accuracy: 0.9710\n " ,
115+ " Epoch 7/10\n " ,
116+ " 104/104 [==============================] - 29s 280ms/step - loss: 0.0451 - accuracy: 0.9912 - val_loss: 0.1501 - val_accuracy: 0.9674\n " ,
117+ " Epoch 8/10\n " ,
118+ " 104/104 [==============================] - 28s 268ms/step - loss: 0.0311 - accuracy: 0.9946 - val_loss: 0.1582 - val_accuracy: 0.9686\n " ,
119+ " Epoch 9/10\n " ,
120+ " 104/104 [==============================] - 29s 279ms/step - loss: 0.0275 - accuracy: 0.9952 - val_loss: 0.1492 - val_accuracy: 0.9710\n " ,
121+ " Epoch 10/10\n " ,
122+ " 104/104 [==============================] - 29s 283ms/step - loss: 0.0249 - accuracy: 0.9955 - val_loss: 0.1553 - val_accuracy: 0.9698\n "
123+ ]
124+ },
125+ {
126+ "output_type" : " execute_result" ,
127+ "data" : {
128+ "text/plain" : [
129+ " <keras.callbacks.History at 0x78bd259d00a0>"
130+ ]
131+ },
132+ "metadata" : {},
133+ "execution_count" : 16
134+ }
135+ ]
136+ },
137+ {
"cell_type": "code",
"source": [
    "# Show layer shapes and parameter counts (~6.7M parameters total,\n",
    "# dominated by the 6.6M-parameter embedding matrix per the output below).\n",
    "model.summary()"
],
142+ "metadata" : {
143+ "colab" : {
144+ "base_uri" : " https://localhost:8080/"
145+ },
146+ "id" : " 95K3lpbr8SkX" ,
147+ "outputId" : " 72b34217-6840-4f9e-f435-64c8861c5222"
148+ },
149+ "execution_count" : 17 ,
150+ "outputs" : [
151+ {
152+ "output_type" : " stream" ,
153+ "name" : " stdout" ,
154+ "text" : [
155+ " Model: \" sequential_2\"\n " ,
156+ " _________________________________________________________________\n " ,
157+ " Layer (type) Output Shape Param # \n " ,
158+ " =================================================================\n " ,
159+ " embedding_2 (Embedding) (None, 100, 128) 6625664 \n " ,
160+ " \n " ,
161+ " lstm_4 (LSTM) (None, 100, 64) 49408 \n " ,
162+ " \n " ,
163+ " dropout_2 (Dropout) (None, 100, 64) 0 \n " ,
164+ " \n " ,
165+ " lstm_5 (LSTM) (None, 64) 33024 \n " ,
166+ " \n " ,
167+ " dense_2 (Dense) (None, 1) 65 \n " ,
168+ " \n " ,
169+ " =================================================================\n " ,
170+ " Total params: 6,708,161\n " ,
171+ " Trainable params: 6,708,161\n " ,
172+ " Non-trainable params: 0\n " ,
173+ " _________________________________________________________________\n "
174+ ]
175+ }
176+ ]
177+ },
178+ {
"cell_type": "code",
180+ "execution_count" : 18 ,
181+ "metadata" : {
182+ "colab" : {
183+ "base_uri" : " https://localhost:8080/"
184+ },
185+ "id" : " JjiVnmmi5Mm6" ,
186+ "outputId" : " dcd0456d-87ea-454d-a363-1ad8193eb132"
187+ },
188+ "outputs" : [
189+ {
190+ "output_type" : " stream" ,
191+ "name" : " stdout" ,
192+ "text" : [
193+ " 33/33 [==============================] - 1s 42ms/step - loss: 0.1547 - accuracy: 0.9700\n " ,
194+ " Test Loss: 0.1547\n " ,
195+ " Test Accuracy: 0.9700\n "
196+ ]
197+ }
198+ ],
"source": [
    "# Final evaluation on the held-out test set.\n",
    "loss, acc = model.evaluate(X_te_pad, y_te)\n",
    "print(f\"Test Loss: {loss:.4f}\")\n",
    "print(f\"Test Accuracy: {acc:.4f}\")\n"
]
204+ },
205+ {
"cell_type": "code",
"source": [
    "# Classify a user-supplied email with the trained model.\n",
    "def classify_email(text):\n",
    "    \"\"\"Return 'Spam' or 'Ham' for a raw email string.\"\"\"\n",
    "    seq = tokenizer.texts_to_sequences([text])\n",
    "    # Reuse max_seq_len instead of re-hard-coding 100, so the\n",
    "    # inference pipeline always matches the training pipeline.\n",
    "    padded = pad_sequences(seq, maxlen=max_seq_len, padding='post')\n",
    "    # predict() returns a (1, 1) array of P(spam); index explicitly\n",
    "    # rather than relying on single-element array truthiness.\n",
    "    return 'Spam' if model.predict(padded)[0][0] > 0.5 else 'Ham'\n",
    "\n",
    "email_text = input(\"Enter an email text: \")\n",
    "print(f\"Prediction: {classify_email(email_text)}\")"
],
219+ "metadata" : {
220+ "colab" : {
221+ "base_uri" : " https://localhost:8080/"
222+ },
223+ "id" : " WvGAXfeW7f3h" ,
224+ "outputId" : " 084ba5c2-4e54-47af-d239-b83a6485b511"
225+ },
226+ "execution_count" : 19 ,
227+ "outputs" : [
228+ {
229+ "output_type" : " stream" ,
230+ "name" : " stdout" ,
231+ "text" : [
232+ " Enter an email text: you won 1 mill\n " ,
233+ " 1/1 [==============================] - 1s 855ms/step\n " ,
234+ " Prediction: Spam\n "
235+ ]
236+ }
237+ ]
238+ }
239+ ]
240+ }