@@ -50,70 +50,42 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 -- ==================================================
 -- 1. Data preparation
 -- ==================================================
-
 -- Create a database which we will use to prepare data for GDS.
 CREATE DATABASE IF NOT EXISTS tpch_example;
 CREATE SCHEMA IF NOT EXISTS tpch_example.gds;
 USE SCHEMA tpch_example.gds;
 
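A quick way to confirm that the TPC-H sample data referenced throughout this script is reachable is to count one of its tables. This is a minimal sanity check, assuming the standard SNOWFLAKE_SAMPLE_DATA share is available as snowflake_sample_data; for scale factor 1 it should return 200,000 rows.

SELECT count(*) FROM snowflake_sample_data.tpch_sf1.part;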
--- GDS expects the data to be in a specific format: a table/view for nodes and a table/view for relationships.
--- In addition, GDS requires node identifiers to be globally unique integers.
+-- GDS reads data from tables that represent nodes and relationships.
+-- Nodes are usually represented by entity tables, like persons or products.
+-- Relationships are expressed as foreign keys between entity tables (1:1, 1:n) or via mapping tables (n:m).
+-- In addition, GDS expects certain naming conventions for column names.
+-- If the data is not yet in the right format, we can use views to get there.
 --
--- For our analysis, the nodes will be parts and the orders in which they appeared.
+-- For our analysis, we will use two different types of nodes: parts and orders.
+-- We want to find similar parts by looking at the orders in which they appeared.
 -- The relationships will be the line items linking a part to an order.
---
--- We start by creating the node view for our graph.
--- First we need to map the primary keys for parts and orders to globally unique node ids.
-
--- We use a sequence to generate globally unique node identifiers.
-CREATE OR REPLACE SEQUENCE global_id START = 0 INCREMENT = 1;
-
--- We create two mapping tables, one for parts and one for orders.
--- This is necessary because the primary key sets for both tables might overlap.
-CREATE OR REPLACE TABLE node_mapping_parts(gdsId, p_partkey) AS
-    SELECT global_id.nextval, p_partkey
-    FROM snowflake_sample_data.tpch_sf1.part;
-CREATE OR REPLACE TABLE node_mapping_orders(gdsId, o_orderkey) AS
-    SELECT global_id.nextval, o_orderkey
-    FROM snowflake_sample_data.tpch_sf1.orders;
-
--- Next, we can create the final node view that we use for our graph projection.
--- Note, that the view must contain a column named "nodeId" to be recognized by GDS.
--- Any additional column will be used as node property, but we don't need that for this example.
-CREATE OR REPLACE VIEW nodes(nodeId) AS
-    SELECT nmp.gdsId FROM node_mapping_parts nmp
-    UNION
-    SELECT nmo.gdsId FROM node_mapping_orders nmo;
-
--- Let's quickly verify the cardinality of our views.
--- As it is the union of parts and orders, we expect 1,700,000 rows.
-SELECT count(*) FROM nodes;
-
--- We can now create the relationship view.
--- As mentioned earlier, we will use the line items to create relationships between parts and orders.
--- We join the line items with parts and orders to get the source and target nodes for our relationships.
--- We also join the mapping tables to get the globally unique node ids.
--- Note, that the view must contain columns named "sourceNodeId" and "targetNodeId" to be recognized by GDS.
--- Any additional column will be used as relationship property, but we don't need that for this example.
-CREATE OR REPLACE VIEW relationships(sourceNodeId, targetNodeId) AS
-    SELECT
-        nmp.gdsId AS sourceNodeId,
-        nmo.gdsId AS targetNodeId
-    FROM snowflake_sample_data.tpch_sf1.part p
-        -- The first two joins build the relationships between parts and orders
-        JOIN snowflake_sample_data.tpch_sf1.lineitem l
-            ON p.p_partkey = l.l_partkey
-        JOIN snowflake_sample_data.tpch_sf1.orders o
-            ON o.o_orderkey = l.l_orderkey
-        -- The second two joins map the primary keys to globally unique node ids
-        JOIN node_mapping_parts nmp
-            ON nmp.p_partkey = p.p_partkey
-        JOIN node_mapping_orders nmo
-            ON nmo.o_orderkey = o.o_orderkey;
-
--- Let's quickly verify the cardinality of our relationship view.
--- As it is the join of parts, line items, and orders, we expect 6,001,215 rows.
-SELECT count(*) FROM relationships;
+-- The result will be a new table containing pairs of parts including their similarity score.
+
+-- We start by creating two views to represent our node tables.
+-- GDS requires a node table to contain a 'nodeId' column.
+-- Since we do not need any node properties, this will be the only column we project.
+-- Note that the 'nodeId' column is used to uniquely identify a node within the table.
+-- This uniqueness is usually achieved by using the primary key of that table, here 'p_partkey'.
+CREATE OR REPLACE VIEW parts(nodeId) AS
+    SELECT p.p_partkey AS nodeId FROM snowflake_sample_data.tpch_sf1.part p;
+
+-- We do the same for the orders by projecting the 'o_orderkey' to 'nodeId'.
+CREATE OR REPLACE VIEW orders(nodeId) AS
+    SELECT o.o_orderkey AS nodeId FROM snowflake_sample_data.tpch_sf1.orders o;
+
+-- The line items represent the relationships between parts and orders.
+-- GDS requires a 'sourceNodeId' and a 'targetNodeId' column to identify the source and target node of each relationship.
+-- Here, a part is the source of a relationship and an order is the target.
+CREATE OR REPLACE VIEW part_in_order(sourceNodeId, targetNodeId) AS
+    SELECT
+        l.l_partkey AS sourceNodeId,
+        l.l_orderkey AS targetNodeId
+    FROM snowflake_sample_data.tpch_sf1.lineitem l;
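Since the relationship view is a straight projection of the line items, its row count should match the lineitem table; the verification step removed above expected 6,001,215 rows for scale factor 1. A minimal check, assuming the view exists in the current schema:

SELECT count(*) AS relationship_count FROM part_in_order;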
 
 -- We have now prepared the data for GDS.
 
@@ -127,8 +99,8 @@ USE DATABASE Neo4j_GDS;
 -- Next, we want to consider the warehouse that the GDS application will use to execute queries.
 -- For this example we use a MEDIUM size warehouse, so we configure the application's warehouse accordingly.
 ALTER WAREHOUSE Neo4j_GDS_app_warehouse SET WAREHOUSE_SIZE='MEDIUM';
--- A highly performant warehouse will speed up graph projections but does not affect algorithm computation.
--- It can therefore be a good idea to alter the warehouse size and make other configuration changes to increase performance when projecting larger amounts of data.
+-- A highly performant warehouse can speed up graph projections but does not affect algorithm computation.
+-- Especially if the views are more complex than shown in this example, a more performant warehouse is beneficial.
 -- The warehouse can then be brought back to a less expensive configuration after the projection is done.
 -- ALTER WAREHOUSE Neo4j_GDS_app_warehouse
 --     SET WAREHOUSE_SIZE='X-SMALL';
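To confirm that the resize took effect before and after the projection, the warehouse state can be inspected with a standard Snowflake command; the warehouse name below is the one used throughout this script.

SHOW WAREHOUSES LIKE 'Neo4j_GDS_app_warehouse';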
@@ -169,12 +141,26 @@ CALL gds.create_session('CPU_X64_L');
 
 -- Once the session is started, we can project our node and relationship views into a GDS in-memory graph.
 -- The graph will be identified by the name "parts_in_orders".
--- The mandatory parameters are the node table and the relationship table, which we point those to our prepared views.
+-- The mandatory parameters are the node tables and the relationship tables.
+-- A node table mapping points from a table/view to a node label that is used in the GDS graph.
+-- For example, the rows of 'tpch_example.gds.parts' will be nodes labeled as 'Part'.
+-- Relationship tables need a bit more configuration.
+-- Besides the type that is used in the GDS graph, here 'PART_IN_ORDER', we also need to specify source and target tables.
 -- We also specify the optional read concurrency to optimize building the graph projection.
 -- The concurrency can be set to the number of cores available on the compute pool node.
 SELECT gds.graph_project('parts_in_orders', {
-    'nodeTable': 'tpch_example.gds.nodes',
-    'relationshipTable': 'tpch_example.gds.relationships',
+    'nodeTables': {
+        'tpch_example.gds.parts': 'Part',
+        'tpch_example.gds.orders': 'Order'
+    },
+    'relationshipTables': {
+        'tpch_example.gds.part_in_order': {
+            'type': 'PART_IN_ORDER',
+            'source_table': 'tpch_example.gds.parts',
+            'target_table': 'tpch_example.gds.orders',
+            'orientation': 'NATURAL'
+        }
+    },
     'readConcurrency': 28
 });
 
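The 'orientation' key above is 'NATURAL', so relationships point from parts to orders, which is what comparing parts by their shared orders needs. As a sketch of a variant, and assuming this API accepts the same orientation values as GDS elsewhere ('NATURAL', 'REVERSE', 'UNDIRECTED'), an undirected projection would look like the following; note that it would also make orders comparable to each other.

SELECT gds.graph_project('parts_in_orders_undirected', {
    'nodeTables': {
        'tpch_example.gds.parts': 'Part',
        'tpch_example.gds.orders': 'Order'
    },
    'relationshipTables': {
        'tpch_example.gds.part_in_order': {
            'type': 'PART_IN_ORDER',
            'source_table': 'tpch_example.gds.parts',
            'target_table': 'tpch_example.gds.orders',
            'orientation': 'UNDIRECTED'
        }
    },
    'readConcurrency': 28
});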
@@ -192,8 +178,10 @@ SELECT gds.node_similarity('parts_in_orders', {
 
 -- Once the algorithm has finished, we can write the results back to Snowflake tables for further analysis.
 -- We want to write back the similarity relationships between parts.
--- The specified table will contain the globally unique source and target node ids and the similarity score.
+-- The specified table will contain the original source and target node ids and the similarity score.
 SELECT gds.write_relationships('parts_in_orders', {
+    'sourceLabel': 'Part',
+    'targetLabel': 'Part',
     'relationshipType': 'SIMILAR_TO',
     'relationshipProperty': 'similarity',
     'table': 'tpch_example.gds.part_similar_to_part'
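Before joining the results back to the part names, it can help to eyeball the raw output table. This assumes the write step above succeeded; the column names (sourcenodeid, targetnodeid, similarity) follow from the write_relationships call and the join in the next hunk.

SELECT * FROM tpch_example.gds.part_similar_to_part LIMIT 10;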
@@ -208,19 +196,13 @@ GRANT SELECT ON tpch_example.gds.part_similar_to_part TO ROLE <your_role>;
 -- Simply speaking, this could be used as a recommendation system for parts.
 SELECT DISTINCT p_source.p_name, p_target.p_name, sim.similarity
 FROM snowflake_sample_data.tpch_sf1.part p_source
-    JOIN tpch_example.gds.node_mapping_parts nmp_source
-        ON p_source.p_partkey = nmp_source.p_partkey
-    JOIN tpch_example.gds.part_similar_to_part sim
-        ON nmp_source.gdsid = sim.sourcenodeid
-    JOIN tpch_example.gds.node_mapping_parts nmp_target
-        ON sim.targetnodeid = nmp_target.gdsid
-    JOIN snowflake_sample_data.tpch_sf1.part p_target
-        ON nmp_target.p_partkey = p_target.p_partkey
-ORDER BY sim.similarity DESC
-LIMIT 10;
+    JOIN tpch_example.gds.part_similar_to_part sim
+        ON p_source.p_partkey = sim.sourcenodeid
+    JOIN snowflake_sample_data.tpch_sf1.part p_target
+        ON p_target.p_partkey = sim.targetnodeid
+ORDER BY sim.similarity DESC LIMIT 10;
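One way to turn the scores into per-part recommendations is to keep only the most similar counterpart for each source part. The sketch below uses Snowflake's QUALIFY clause on the same tables as the query above.

SELECT p_source.p_name AS part, p_target.p_name AS recommended_part, sim.similarity
FROM snowflake_sample_data.tpch_sf1.part p_source
    JOIN tpch_example.gds.part_similar_to_part sim
        ON p_source.p_partkey = sim.sourcenodeid
    JOIN snowflake_sample_data.tpch_sf1.part p_target
        ON p_target.p_partkey = sim.targetnodeid
QUALIFY ROW_NUMBER() OVER (PARTITION BY p_source.p_partkey ORDER BY sim.similarity DESC) = 1
LIMIT 10;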
 
 -- The GDS service is a long-running service and should be stopped when not in use.
 -- Once we have completed our analysis, we can stop the session, which suspends the container service.
 -- We can restart the session at any time to continue our analysis.
-CALL Neo4j_GDS.gds.stop_session();
-
+CALL gds.stop_session();