Skip to content

Commit 08c3ae8

Browse files
BC changes and autotuner support for GPU backend
1 parent 6754d68 commit 08c3ae8

27 files changed

Lines changed: 968 additions & 101 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,6 @@
55
.settings/
66
.idea/
77
build/
8+
autotune/*.json
9+
*.graphit_bin
10+
*.graphit_sbin

autotune/compile_gpu.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
python ../build/bin/graphitc.py -a algotorun.gt -f schedule_0 -o test.cu
22
/usr/local/cuda/bin/nvcc -ccbin /usr/bin/c++ -std=c++11 -I ../src/runtime_lib/ -o test -Xcompiler "-w" -O3 test.cu -DNUM_CTA=80 -DCTA_SIZE=512 -Wno-deprecated-gpu-targets -gencode arch=compute_70,code=sm_70 --use_fast_math -Xptxas "-v -dlcm=ca --maxrregcount=64" -rdc=true -DFRONTIER_MULTIPLIER=3
3+
#/usr/local/cuda/bin/nvcc -ccbin /usr/bin/c++ -std=c++11 -I ../src/runtime_lib/ -o test -Xcompiler "-w" -O3 test.cu -DNUM_CTA=60 -DCTA_SIZE=512 -Wno-deprecated-gpu-targets -gencode arch=compute_61,code=sm_61 --use_fast_math -Xptxas "-v -dlcm=ca --maxrregcount=64" -rdc=true -DFRONTIER_MULTIPLIER=2

autotune/gpu_apps/bfs.gt

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
element Vertex end
2+
element Edge end
3+
4+
const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]);
5+
const vertices : vertexset{Vertex} = edges.getVertices();
6+
const parent : vector{Vertex}(int) = -1;
7+
8+
9+
func updateEdge(src : Vertex, dst : Vertex)
10+
parent[dst] = src;
11+
end
12+
13+
func toFilter(v : Vertex) -> output : bool
14+
output = parent[v] == -1;
15+
end
16+
17+
func reset(v: Vertex)
18+
parent[v] = -1;
19+
end
20+
21+
func main()
22+
for trail in 0:10
23+
var frontier : vertexset{Vertex} = new vertexset{Vertex}(0);
24+
startTimer();
25+
vertices.apply(reset);
26+
var start_vertex : int = atoi(argv[2]);
27+
frontier.addVertex(start_vertex);
28+
parent[start_vertex] = start_vertex;
29+
30+
#s0# while (frontier.getVertexSetSize() != 0)
31+
#s1# var output : vertexset{Vertex} = edges.from(frontier).to(toFilter).applyModified(updateEdge,parent, true);
32+
delete frontier;
33+
frontier = output;
34+
end
35+
var elapsed_time : float = stopTimer();
36+
delete frontier;
37+
print "elapsed time: ";
38+
print elapsed_time;
39+
end
40+
end
41+

autotune/gpu_apps/cc.gt

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
element Vertex end
2+
element Edge end
3+
4+
const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]);
5+
6+
const vertices : vertexset{Vertex} = edges.getVertices();
7+
const IDs : vector{Vertex}(int) = 1;
8+
9+
const update: vector[1](int);
10+
11+
func updateEdge(src : Vertex, dst : Vertex)
12+
var src_id: Vertex = IDs[src];
13+
var dst_id: Vertex = IDs[dst];
14+
15+
IDs[dst_id] min= IDs[src_id];
16+
IDs[src_id] min= IDs[dst_id];
17+
end
18+
19+
func init(v : Vertex)
20+
IDs[v] = v;
21+
end
22+
23+
func pjump(v: Vertex)
24+
var y: Vertex = IDs[v];
25+
var x: Vertex = IDs[y];
26+
if x != y
27+
IDs[v] = x;
28+
update[0] = 1;
29+
end
30+
end
31+
32+
func main()
33+
var n : int = edges.getVertices();
34+
for trail in 0:10
35+
var frontier : vertexset{Vertex} = new vertexset{Vertex}(n);
36+
startTimer();
37+
vertices.apply(init);
38+
#s0# while (frontier.getVertexSetSize() != 0)
39+
#s1# var output: vertexset{Vertex} = edges.from(frontier).applyModified(updateEdge,IDs);
40+
delete frontier;
41+
frontier = output;
42+
update[0] = 1;
43+
while update[0] != 0
44+
update[0] = 0;
45+
vertices.apply(pjump);
46+
end
47+
end
48+
var elapsed_time : float = stopTimer();
49+
delete frontier;
50+
print "elapsed time: ";
51+
print elapsed_time;
52+
end
53+
end
54+
55+

autotune/gpu_apps/pagerank.gt

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
element Vertex end
2+
element Edge end
3+
const edges : edgeset{Edge}(Vertex,Vertex) = load (argv[1]);
4+
const vertices : vertexset{Vertex} = edges.getVertices();
5+
const old_rank : vector{Vertex}(float) = 1.0/vertices.size();
6+
const new_rank : vector{Vertex}(float) = 0.0;
7+
const out_degree : vector {Vertex}(int) = edges.getOutDegrees();
8+
const contrib : vector{Vertex}(float) = 0.0;
9+
const error : vector{Vertex}(float) = 0.0;
10+
const damp : float = 0.85;
11+
const beta_score : float = (1.0 - damp) / vertices.size();
12+
13+
func computeContrib(v : Vertex)
14+
contrib[v] = old_rank[v] / out_degree[v];
15+
end
16+
17+
func updateEdge(src : Vertex, dst : Vertex)
18+
new_rank[dst] += contrib[src];
19+
end
20+
21+
func updateVertex(v : Vertex)
22+
var old_score : float = old_rank[v];
23+
new_rank[v] = beta_score + damp*(new_rank[v]);
24+
error[v] = fabs(new_rank[v] - old_rank[v]);
25+
old_rank[v] = new_rank[v];
26+
new_rank[v] = 0.0;
27+
end
28+
29+
func printRank(v : Vertex)
30+
print old_rank[v];
31+
end
32+
33+
func reset(v: Vertex)
34+
old_rank[v] = 1.0/vertices.size();
35+
new_rank[v] = 0.0;
36+
end
37+
38+
func main()
39+
for trail in 0:10
40+
startTimer();
41+
vertices.apply(reset);
42+
#s0# for i in 0:20
43+
vertices.apply(computeContrib);
44+
#s1# edges.apply(updateEdge);
45+
vertices.apply(updateVertex);
46+
end
47+
48+
var elapsed_time : float = stopTimer();
49+
print "elapsed time: ";
50+
print elapsed_time;
51+
end
52+
end
53+

autotune/graphit_gpu_autotuner.py

Lines changed: 104 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -29,57 +29,117 @@ def manipulator(self):
2929
Define the search space by creating a
3030
ConfigurationManipulator
3131
"""
32+
manipulator = ConfigurationManipulator()
33+
if self.args.edge_only:
34+
#manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT', 'EDGE_ONLY']))
35+
manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'EDGE_ONLY']))
36+
manipulator.add_parameter(EnumParameter('EB_0', ['ENABLED', 'DISABLED']))
37+
manipulator.add_parameter(IntegerParameter('BS_0', 1, 20))
38+
else:
39+
#manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT']))
40+
manipulator.add_parameter(EnumParameter('LB_0', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM']))
3241

42+
manipulator.add_parameter(EnumParameter('direction_0', ['PUSH', 'PULL']))
43+
manipulator.add_parameter(EnumParameter('dedup_0', ['ENABLED', 'DISABLED']))
44+
manipulator.add_parameter(EnumParameter('frontier_output_0', ['FUSED', 'UNFUSED_BITMAP', 'UNFUSED_BOOLMAP']))
45+
manipulator.add_parameter(EnumParameter('pull_rep_0', ['BITMAP', 'BOOLMAP']))
3346

47+
if self.args.hybrid_schedule:
48+
#manipulator.add_parameter(EnumParameter('LB_1', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT']))
49+
manipulator.add_parameter(EnumParameter('LB_1', ['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM']))
50+
51+
manipulator.add_parameter(EnumParameter('direction_1', ['PUSH', 'PULL']))
52+
manipulator.add_parameter(EnumParameter('dedup_1', ['ENABLED', 'DISABLED']))
53+
manipulator.add_parameter(EnumParameter('frontier_output_1', ['FUSED', 'UNFUSED_BITMAP', 'UNFUSED_BOOLMAP']))
54+
manipulator.add_parameter(EnumParameter('pull_rep_1', ['BITMAP', 'BOOLMAP']))
55+
56+
# We also choose the hybrid schedule threshold here
57+
manipulator.add_parameter(IntegerParameter('threshold', 0, 1000))
58+
59+
3460

35-
manipulator = ConfigurationManipulator()
36-
manipulator.add_parameter(
37-
EnumParameter('LB',
38-
['VERTEX_BASED','TWC', 'TWCE', 'WM', 'CM', 'STRICT']))
39-
40-
#'edge-aware-dynamic-vertex-parallel' not supported with the latest g++ cilk implementation
41-
manipulator.add_parameter(EnumParameter('direction', ['PUSH', 'PULL']))
42-
manipulator.add_parameter(EnumParameter('dedup', ['ENABLED', 'DISABLED']))
43-
manipulator.add_parameter(EnumParameter('frontier_output', ['FUSED', 'UNFUSED_BITMAP', 'UNFUSED_BOOLMAP']))
4461
# adding new parameters for PriorityGraph (Ordered GraphIt)
45-
manipulator.add_parameter(IntegerParameter('delta', 1, self.args.max_delta))
62+
# Currently, since delta can be configured only once for the entire program, we make a single decision even if the schedule is hybrid
63+
if self.args.tune_delta:
64+
manipulator.add_parameter(IntegerParameter('delta', 1, self.args.max_delta))
65+
66+
67+
if self.args.kernel_fusion:
68+
manipulator.add_parameter(EnumParameter('kernel_fusion', ['DISABLED', 'ENABLED']))
4669

47-
manipulator.add_parameter(EnumParameter('kernel_fusion', ['DISABLED', 'ENABLED']))
48-
manipulator.add_parameter(EnumParameter('pull_rep', ['BITMAP', 'BOOLMAP']))
4970
return manipulator
5071

5172

5273
def write_cfg_to_schedule(self, cfg):
5374
#write into a schedule file the configuration
54-
direction = cfg['direction']
55-
delta = cfg['delta']
56-
dedup = cfg['dedup']
57-
frontier_output = cfg['frontier_output']
58-
kernel_fusion = cfg['kernel_fusion']
59-
pull_rep = cfg['pull_rep']
60-
LB = cfg['LB']
75+
76+
direction_0 = cfg['direction_0']
77+
if self.args.tune_delta:
78+
delta_0 = cfg['delta']
79+
dedup_0 = cfg['dedup_0']
80+
frontier_output_0 = cfg['frontier_output_0']
81+
pull_rep_0 = cfg['pull_rep_0']
82+
LB_0 = cfg['LB_0']
6183

6284
new_schedule = "schedule:\n"
85+
6386
new_schedule += "SimpleGPUSchedule s1;\n";
64-
new_schedule += "s1.configLoadBalance(" + LB + ");\n"
65-
new_schedule += "s1.configFrontierCreation(" + frontier_output + ");\n"
66-
if direction == "PULL":
67-
new_schedule += "s1.configDirection(PULL, " + pull_rep + ");\n"
87+
if LB_0 == "EDGE_ONLY" and cfg['EB_0'] == "ENABLED":
88+
new_schedule += "s1.configLoadBalance(EDGE_ONLY, BLOCKED, " + str(int(int(self.args.num_vertices)/cfg['BS_0'])) + ");\n"
89+
direction_0 = "PUSH"
90+
else:
91+
new_schedule += "s1.configLoadBalance(" + LB_0 + ");\n"
92+
new_schedule += "s1.configFrontierCreation(" + frontier_output_0 + ");\n"
93+
if direction_0 == "PULL":
94+
new_schedule += "s1.configDirection(PULL, " + pull_rep_0 + ");\n"
6895
else:
6996
new_schedule += "s1.configDirection(PUSH);\n"
70-
new_schedule += "s1.configDelta(" + str(delta) + ");\n"
71-
new_schedule += "s1.configDeduplication(" + dedup + ");\n"
72-
new_schedule += "program->applyGPUSchedule(\"s0:s1\", s1);\n"
73-
new_schedule += "SimpleGPUSchedule s0;\n"
74-
new_schedule += "s0.configKernelFusion(" + kernel_fusion + ");\n"
75-
# We will currently not apply this. Use this after kernel fusion is fixed
76-
#new_schedule += "program->applyGPUSchedule(\"s0\", s0);\n"
97+
if self.args.tune_delta:
98+
new_schedule += "s1.configDelta(" + str(delta_0) + ");\n"
99+
new_schedule += "s1.configDeduplication(" + dedup_0 + ");\n"
100+
101+
if self.args.hybrid_schedule:
102+
direction_1 = cfg['direction_1']
103+
if self.args.tune_delta:
104+
delta_1 = cfg['delta']
105+
dedup_1 = cfg['dedup_1']
106+
frontier_output_1 = cfg['frontier_output_1']
107+
pull_rep_1 = cfg['pull_rep_1']
108+
LB_1 = cfg['LB_1']
109+
110+
#threshold = self.args.hybrid_threshold
111+
threshold = cfg['threshold']
112+
113+
new_schedule += "SimpleGPUSchedule s2;\n";
114+
new_schedule += "s2.configLoadBalance(" + LB_1 + ");\n"
115+
new_schedule += "s2.configFrontierCreation(" + frontier_output_1 + ");\n"
116+
if direction_1 == "PULL":
117+
new_schedule += "s2.configDirection(PULL, " + pull_rep_1 + ");\n"
118+
else:
119+
new_schedule += "s2.configDirection(PUSH);\n"
120+
if self.args.tune_delta:
121+
new_schedule += "s2.configDelta(" + str(delta_1) + ");\n"
122+
new_schedule += "s2.configDeduplication(" + dedup_1 + ");\n"
123+
124+
new_schedule += "HybridGPUSchedule h1(INPUT_VERTEXSET_SIZE, " + str(threshold/1000) + ", s1, s2);\n"
125+
new_schedule += "program->applyGPUSchedule(\"s0:s1\", h1);\n"
126+
127+
else:
128+
new_schedule += "program->applyGPUSchedule(\"s0:s1\", s1);\n"
129+
130+
131+
132+
if self.args.kernel_fusion:
133+
kernel_fusion = cfg['kernel_fusion']
134+
new_schedule += "SimpleGPUSchedule s0;\n"
135+
new_schedule += "s0.configKernelFusion(" + kernel_fusion + ");\n"
136+
new_schedule += "program->applyGPUSchedule(\"s0\", s0);\n"
77137

78138
print (cfg)
79-
print (new_schedule)
139+
#print (new_schedule)
80140

81141
self.new_schedule_file_name = 'schedule_0'
82-
print (self.new_schedule_file_name)
142+
#print (self.new_schedule_file_name)
83143
f1 = open (self.new_schedule_file_name, 'w')
84144
f1.write(new_schedule)
85145
f1.close()
@@ -174,7 +234,7 @@ def compile_and_run(self, desired_result, input, limit):
174234
Compile and run a given configuration then
175235
return performance
176236
"""
177-
print ("input graph: " + self.args.graph)
237+
# print ("input graph: " + self.args.graph)
178238

179239
cfg = desired_result.configuration.data
180240

@@ -190,7 +250,7 @@ def compile_and_run(self, desired_result, input, limit):
190250
def save_final_config(self, configuration):
191251
"""called at the end of tuning"""
192252
print ('Final Configuration:', configuration.data)
193-
self.manipulator().save_to_file(configuration.data,'final_config.json')
253+
self.manipulator().save_to_file(configuration.data, self.args.final_config)
194254

195255

196256

@@ -200,11 +260,21 @@ def save_final_config(self, configuration):
200260
parser.add_argument('--start_vertex', type=str, default="0", help="Start vertex if applicable")
201261

202262
parser.add_argument('--algo_file', type=str, required=True, help='input algorithm file')
263+
parser.add_argument('--final_config', type=str, help='Final config file', default="final_config.json")
203264
parser.add_argument('--default_schedule_file', type=str, required=False, default="", help='default schedule file')
204265
parser.add_argument('--runtime_limit', type=float, default=300, help='a limit on the running time of each program')
205266
parser.add_argument('--max_delta', type=int, default=800000, help='maximum delta used for priority coarsening')
206267
parser.add_argument('--memory_limit', type=int, default=-1,help='set memory limit on unix based systems [does not quite work yet]')
207268
parser.add_argument('--killed_process_report_runtime_limit', type=int, default=0, help='reports runtime_limit when a process is killed by the shell. 0 for disable (default), 1 for enable')
269+
270+
parser.add_argument('--kernel_fusion', type=bool, default=False, help='Choose if you want to also tune kernel fusion')
271+
parser.add_argument('--hybrid_schedule', type=bool, default=False, help='Choose if you want to also explore hybrid schedules')
272+
parser.add_argument('--edge_only', type=bool, default=False, help='Choose if you want to also enable EDGE_ONLY schedules')
273+
parser.add_argument('--num_vertices', type=int, required=True, help='Supply number of vertices in the graph')
274+
parser.add_argument('--tune_delta', type=bool, default=False, help='Also tune the delta parameter')
275+
parser.add_argument('--hybrid_threshold', type=int, default=1000, help='Threshold value on 1000')
276+
277+
208278
args = parser.parse_args()
209279
# pass the arguments into the tuner
210280
GraphItTuner.main(args)

include/graphit/backend/codegen_gpu/assign_function_context.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class AssignFunctionContext : mir::MIRVisitor {
2020
void visit(mir::UpdatePriorityEdgeSetApplyExpr::Ptr);
2121
void visit(mir::PullEdgeSetApplyExpr::Ptr);
2222
void visit(mir::VertexSetApplyExpr::Ptr);
23+
void visit(mir::VertexSetWhereExpr::Ptr);
2324
private:
2425
MIRContext *mir_context_;
2526
};

include/graphit/backend/codegen_gpu/codegen_gpu.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ class CodeGenGPU: public mir::MIRVisitor{
6969
void genPropertyArrayAlloca(mir::VarDecl::Ptr);
7070

7171
void genFusedWhileLoop(mir::WhileStmt::Ptr);
72-
void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr);
72+
virtual void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr);
7373

7474
EdgesetApplyFunctionDeclGenerator* edgeset_apply_func_gen_;
7575

@@ -142,6 +142,12 @@ class CodeGenGPU: public mir::MIRVisitor{
142142

143143
virtual void visit(mir::EnqueueVertex::Ptr) override;
144144

145+
virtual void visit(mir::VertexSetWhereExpr::Ptr) override;
146+
147+
148+
virtual void visit(mir::ListType::Ptr) override;
149+
virtual void visit(mir::ListAllocExpr::Ptr) override;
150+
145151
void genPriorityUpdateOperator(mir::PriorityUpdateOperator::Ptr);
146152

147153
};
@@ -179,14 +185,15 @@ class CodeGenGPUFusedKernel: public CodeGenGPU {
179185
}
180186
current_while_stmt->used_priority_queues.push_back(var);
181187
}
182-
void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr);
188+
virtual void genEdgeSetApplyExpr(mir::EdgeSetApplyExpr::Ptr, mir::Expr::Ptr) override;
183189
virtual void visit(mir::StmtBlock::Ptr) override;
184190
virtual void visit(mir::AssignStmt::Ptr) override;
185191
virtual void visit(mir::VarDecl::Ptr) override;
186192
virtual void visit(mir::VarExpr::Ptr) override;
187193
virtual void visit(mir::PrintStmt::Ptr) override;
188194
virtual void visit(mir::HybridGPUStmt::Ptr) override;
189195
virtual void visit(mir::VertexSetDedupExpr::Ptr) override;
196+
virtual void visit(mir::VertexSetApplyExpr::Ptr) override;
190197

191198
std::string var_name (std::string var) {
192199
//return current_kernel_name + "_" + var;

0 commit comments

Comments
 (0)