-
Notifications
You must be signed in to change notification settings - Fork 57
Expand file tree
/
Copy pathsnoopi_deep.jl
More file actions
311 lines (269 loc) · 11.4 KB
/
snoopi_deep.jl
File metadata and controls
311 lines (269 loc) · 11.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
struct InferenceTiming
mi_info::Core.Compiler.Timings.InferenceFrameInfo
inclusive_time::Float64
exclusive_time::Float64
end
"""
inclusive(frame)
Return the time spent inferring `frame` and its callees.
"""
inclusive(it::InferenceTiming) = it.inclusive_time
"""
exclusive(frame)
Return the time spent inferring `frame`, not including the time needed for any of its callees.
"""
exclusive(it::InferenceTiming) = it.exclusive_time
struct InferenceTimingNode
mi_timing::InferenceTiming
start_time::Float64
children::Vector{InferenceTimingNode}
bt
parent::InferenceTimingNode
# Root constructor
InferenceTimingNode(mi_timing::InferenceTiming, start_time, @nospecialize(bt)) =
new(mi_timing, start_time, InferenceTimingNode[], bt)
# Child constructor
function InferenceTimingNode(mi_timing::InferenceTiming, start_time, @nospecialize(bt), parent::InferenceTimingNode)
child = new(mi_timing, start_time, InferenceTimingNode[], bt, parent)
push!(parent.children, child)
return child
end
end
inclusive(node::InferenceTimingNode) = inclusive(node.mi_timing)
exclusive(node::InferenceTimingNode) = exclusive(node.mi_timing)
InferenceTiming(node::InferenceTimingNode) = node.mi_timing
function InferenceTimingNode(t::Core.Compiler.Timings.Timing)
ttree = timingtree(t)
it, start_time, ttree_children = ttree::Tuple{InferenceTiming, Float64, Vector{Any}}
root = InferenceTimingNode(it, start_time, t.bt)
addchildren!(root, t, ttree_children)
return root
end
# Compute inclusive times and store as a temporary tree.
# To allow InferenceTimingNode to be both bidirectional and immutable, we need to create parent node before the child nodes.
# However, each node stores its inclusive time, which can only be computed efficiently from the leaves up (children before parents).
# This performs the inclusive-time computation, storing the result as a "temporary tree" that can be used during
# InferenceTimingNode creation (see `addchildren!`).
function timingtree(t::Core.Compiler.Timings.Timing)
time, start_time = t.time/10^9, t.start_time/10^9
incl_time = time
tchildren = []
for child in t.children
tchild = timingtree(child)
push!(tchildren, tchild)
incl_time += inclusive(tchild[1])
end
return (InferenceTiming(t.mi_info, incl_time, time), start_time, tchildren)
end
function addchildren!(parent::InferenceTimingNode, t::Core.Compiler.Timings.Timing, ttrees)
for (child, ttree) in zip(t.children, ttrees)
it, start_time, ttree_children = ttree::Tuple{InferenceTiming, Float64, Vector{Any}}
node = InferenceTimingNode(it, start_time, child.bt, parent)
addchildren!(node, child, ttree_children)
end
end
module SnoopiDeepParallelism
# Mutex ordering: MUTEX > jl_typeinf_lock
const MUTEX = ReentrantLock()
mutable struct Invocation
# start_idx is mutated when older invocations are deleted and the profile is shifted.
start_idx::Int
stop_idx::Int
start_time::UInt64
root_start_excl_time::UInt64
root_stop_excl_time::UInt64
end
function Invocation(start_idx, start_root_excl_time)
# Start at the current time.
return Invocation(start_idx, 0, time_ns(), start_root_excl_time, 0)
end
"""
Global (locked) vector tracking running snoopi calls, and when they started.
- When one finishes, we lock(inference), export results, clear the inference profiles up to
the next oldest snoopi call, then unlock(inference).
Imagine this is an ongoing inference profile, where each letter is another inference profile
result, and we start two profiles, 1 and 2, at the times indicated below:
ABCDEFGHIJKLMNOPQRSTUVWX
1> 2> <1 <2
- invocations: [(1,A), (2,D)]
- 1 ends:
copy out ABCDEFGHIJKLMNOPQRSTU
pop (1,A) from invocations
read oldest invocation: (2,D)
delete up to D.
- New profile:
DEFGHIJKLMNOPQRSTUVWX
2> <2
- 2 ends:
copy out DEFGHIJKLMNOPQRSTUVWX
pop (2,D) from invocations
no active invocations, so ...
... delete up to X (end of this profile).
"""
const invocations = Invocation[]
function _current_profile_stats_locked()
ccall(:jl_typeinf_lock_begin, Cvoid, ())
try
inference_root_timing = Core.Compiler.Timings._timings[1]
children = inference_root_timing.children
# Since we were able to grab the lock, we must not be in an inference profile,
# meaning we are in ROOT(). So to get an accurate ROOT timing, we have to add the
# accumulated time since the ROOT was last updated:
accum_root_time = time_ns() - inference_root_timing.cur_start_time
current_root_time = inference_root_timing.time + accum_root_time
return length(children), current_root_time
finally
ccall(:jl_typeinf_lock_end, Cvoid, ())
end
end
function _fetch_profile_buffer_locked(start_idx, stop_idx)
ccall(:jl_typeinf_lock_begin, Cvoid, ())
try
inference_root_timing = Core.Compiler.Timings._timings[1]
children = inference_root_timing.children
return children[start_idx:stop_idx]
finally
ccall(:jl_typeinf_lock_end, Cvoid, ())
end
end
function start_timing_invocation()
# Locking respects mutex ordering.
Base.@lock MUTEX begin
current_profile_length, current_root_time = _current_profile_stats_locked()
profile_start_idx = current_profile_length + 1
invocation = Invocation(profile_start_idx, current_root_time)
push!(invocations, invocation)
return invocation
end
end
function stop_timing_invocation!(invocation)
invocation.stop_idx, invocation.root_stop_excl_time = _current_profile_stats_locked()
end
function finish_timing_invocation_and_clear_profile(invocation)
# Locking respects mutex ordering.
Base.@lock MUTEX begin
# Check if this invocation was the oldest. If so, we'll want to clear the parts of
# the profile only it was using.
if invocations[1] !== invocation
idx = findfirst(==(invocation), invocations)
@assert idx !== nothing "invocation wasn't found in invocations: $invocation."
deleteat!(invocations, idx)
return
end
# Clear this invocation from the invocations vector.
popfirst!(invocations)
# Now clear the global inference profile up to the start of the next invocation.
# If no next invocations, clear them all.
if isempty(invocations)
ccall(:jl_typeinf_lock_begin, Cvoid, ())
try
Core.Compiler.Timings.reset_timings()
finally
ccall(:jl_typeinf_lock_end, Cvoid, ())
end
return
end
# Else, we stop at the next oldest invocation.
next_oldest = invocations[1]
start_idx = next_oldest.start_idx
to_delete = start_idx - 1
if to_delete == 0
return
end
# Shift back the indices for all the running invocations
for running_invocation in invocations
running_invocation.start_idx -= to_delete
running_invocation.stop_idx -= to_delete
end
# Clear the profile up to the start of the new oldest invocation.
ccall(:jl_typeinf_lock_begin, Cvoid, ())
try
inference_root_timing = Core.Compiler.Timings._timings[1]
children = inference_root_timing.children
deleteat!(children, 1:to_delete)
finally
ccall(:jl_typeinf_lock_end, Cvoid, ())
end
end
end
end # module
function start_deep_timing()
invocation = SnoopiDeepParallelism.start_timing_invocation()
Core.Compiler.__set_measure_typeinf(true)
return invocation
end
function stop_deep_timing!(invocation)
Core.Compiler.__set_measure_typeinf(false)
return SnoopiDeepParallelism.stop_timing_invocation!(invocation)
end
function finish_snoopi_deep(invocation)
buffer = SnoopiDeepParallelism._fetch_profile_buffer_locked(invocation.start_idx, invocation.stop_idx)
# Clean up the profile buffer, so that we don't leak memory.
SnoopiDeepParallelism.finish_timing_invocation_and_clear_profile(invocation)
root_node = _create_finished_ROOT_Timing(invocation, buffer)
return InferenceTimingNode(root_node)
end
# The MethodInstance for ROOT(), and default empty values for other fields.
# Copied from julia typeinf
root_inference_frame_info() =
Core.Compiler.Timings.InferenceFrameInfo(Core.Compiler.Timings.ROOTmi, 0x0, Any[], Any[Core.Const(Core.Compiler.Timings.ROOT)], 1)
function _create_finished_ROOT_Timing(invocation, buffer)
total_time = time_ns() - invocation.start_time
# Create a new ROOT() node, specific to this profiling invocation, which wraps the
# current profile buffer, and contains the total time for the profile.
return Core.Compiler.Timings.Timing(
root_inference_frame_info(),
invocation.start_time,
0,
# Total exclusive time spent in ROOT during the lifetime of this node.
invocation.root_stop_excl_time - invocation.root_start_excl_time,
# Use the copied-out section of the profile buffer as the children of ROOT()
buffer,
)
end
function _snoopi_deep(cmd::Expr)
return quote
invocation = start_deep_timing()
try
$(esc(cmd))
finally
stop_deep_timing!(invocation)
end
# return the timing result:
finish_snoopi_deep(invocation)
end
end
"""
tinf = @snoopi_deep commands
Produce a profile of julia's type inference, recording the amount of time spent inferring
every `MethodInstance` processed while executing `commands`. Each fresh entrance to
type inference (whether executed directly in `commands` or because a call was made
by runtime-dispatch) also collects a backtrace so the caller can be identified.
`tinf` is a tree, each node containing data on a particular inference "frame" (the method,
argument-type specializations, parameters, and even any constant-propagated values).
Each reports the [`exclusive`](@ref)/[`inclusive`](@ref) times, where the exclusive
time corresponds to the time spent inferring this frame in and of itself, whereas
the inclusive time includes the time needed to infer all the callees of this frame.
The top-level node in this profile tree is `ROOT`. Uniquely, its exclusive time
corresponds to the time spent _not_ in julia's type inference (codegen, llvm_opt, runtime, etc).
There are many different ways of inspecting and using the data stored in `tinf`.
The simplest is to load the `AbstracTrees` package and display the tree with
`AbstractTrees.print_tree(tinf)`.
See also: `flamegraph`, `flatten`, `inference_triggers`, `SnoopCompile.parcel`,
`runtime_inferencetime`.
# Example
```jldoctest; setup=:(using SnoopCompile), filter=r"([0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?/[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?|\\d direct)"
julia> tinf = @snoopi_deep begin
sort(rand(100)) # Evaluate some code and profile julia's type inference
end
InferenceTimingNode: 0.110018224/0.131464476 on Core.Compiler.Timings.ROOT() with 2 direct children
```
"""
macro snoopi_deep(cmd)
return _snoopi_deep(cmd)
end
# These are okay to come at the top-level because we're only measuring inference, and
# inference results will be cached in a `.ji` file.
precompile(start_deep_timing, ())
precompile(stop_deep_timing!, (SnoopiDeepParallelism.Invocation,))
precompile(finish_snoopi_deep, (SnoopiDeepParallelism.Invocation,))