Skip to content

Commit 74336c8

Browse files
Adrian Coleadriancole
Adrian Cole
authored andcommitted
Rewrites Prometheus filter to neither double-register or double-count
1 parent 37bd398 commit 74336c8

File tree

3 files changed

+148
-12
lines changed

3 files changed

+148
-12
lines changed

zipkin-autoconfigure/metrics-prometheus/src/main/java/zipkin/autoconfigure/metrics/PrometheusMetricsAutoConfiguration.java

Lines changed: 113 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,22 @@
1414

1515
package zipkin.autoconfigure.metrics;
1616

17+
import io.prometheus.client.Histogram;
1718
import io.prometheus.client.hotspot.DefaultExports;
1819
import io.prometheus.client.spring.boot.EnablePrometheusEndpoint;
1920
import io.prometheus.client.spring.boot.EnableSpringBootMetricsCollector;
2021
import io.prometheus.client.spring.web.EnablePrometheusTiming;
22+
import java.io.IOException;
23+
import javax.servlet.AsyncContext;
24+
import javax.servlet.AsyncEvent;
25+
import javax.servlet.AsyncListener;
2126
import javax.servlet.Filter;
27+
import javax.servlet.FilterChain;
28+
import javax.servlet.FilterConfig;
2229
import javax.servlet.ServletException;
30+
import javax.servlet.ServletRequest;
31+
import javax.servlet.ServletResponse;
32+
import javax.servlet.http.HttpServletRequest;
2333
import org.springframework.context.annotation.Bean;
2434
import org.springframework.context.annotation.Configuration;
2535

@@ -32,11 +42,108 @@ public class PrometheusMetricsAutoConfiguration {
3242
DefaultExports.initialize();
3343
}
3444

35-
@Bean
36-
public Filter prometheusMetricsFilter() throws ServletException {
37-
return new io.prometheus.client.filter.MetricsFilter("http_request_duration_seconds",
38-
"Response time histogram",
39-
0,
40-
null);
45+
// Obviates the state bug in MetricsFilter which implicitly registers and hides something you
46+
// can't create twice
47+
static final Histogram http_request_duration_seconds = Histogram.build()
48+
.labelNames("path", "method")
49+
.help("Response time histogram")
50+
.name("http_request_duration_seconds")
51+
.register();
52+
53+
@Bean("http_request_duration_seconds") Histogram http_request_duration_seconds() {
54+
return http_request_duration_seconds;
55+
}
56+
57+
@Bean public Filter prometheusMetricsFilter() {
58+
return new PrometheusDurationFilter();
59+
}
60+
61+
/**
62+
* The normal prometheus metrics filter implicitly registers a histogram which is hidden in a
63+
* field and not deregistered on destroy. A registration of any second instance of that filter
64+
* fails trying to re-register the same collector by design (by brian-brazil). The rationale is
65+
* that you are not supposed to recreate the same histogram. However, this design prevents us from
66+
* doing that. brian-bazil's hard stance on this makes the filter unusable for applications who
67+
* run tests.
68+
*
69+
* <p>This filter replaces the normal prometheus filter, correcting the design flaw by allowing us
70+
* to re-use the JVM singleton. It also corrects a major flaw in the upstream filter which results
71+
* in double-counting of requests when they are performed asynchronously. When the culture changes
72+
* in the prometheus project such that bugs are fixable, please submit this so that it can help
73+
* others. For more info, see "brian's bomb" https://github.com/openzipkin/zipkin/issues/1811
74+
*/
75+
static final class PrometheusDurationFilter implements Filter {
76+
@Override public void init(FilterConfig filterConfig) {
77+
}
78+
79+
/**
80+
* Note that upstream also has a problem which is that it doesn't handle async properly.
81+
* MetricsFilter results in double-counting, which this implementation avoids.
82+
*/
83+
@Override public void doFilter(ServletRequest servletRequest, ServletResponse servletResponse,
84+
FilterChain filterChain) throws IOException, ServletException {
85+
86+
HttpServletRequest request = (HttpServletRequest) servletRequest;
87+
88+
// async servlets will enter the filter twice
89+
if (request.getAttribute("PrometheusDurationFilter") != null) {
90+
filterChain.doFilter(request, servletResponse);
91+
return;
92+
}
93+
94+
request.setAttribute("PrometheusDurationFilter", "true");
95+
96+
Histogram.Timer timer = http_request_duration_seconds
97+
.labels(request.getRequestURI(), request.getMethod())
98+
.startTimer();
99+
100+
try {
101+
filterChain.doFilter(servletRequest, servletResponse);
102+
} finally {
103+
if (request.isAsyncStarted()) { // we don't have the actual response, handle later
104+
request.getAsyncContext().addListener(new CompleteTimer(timer));
105+
} else { // we have a synchronous response, so we can finish the recording
106+
timer.observeDuration();
107+
}
108+
}
109+
}
110+
111+
@Override public void destroy() {
112+
}
113+
}
114+
115+
/** Inspired by WingtipsRequestSpanCompletionAsyncListener */
116+
static final class CompleteTimer implements AsyncListener {
117+
final Histogram.Timer timer;
118+
volatile boolean completed = false;
119+
120+
CompleteTimer(Histogram.Timer timer) {
121+
this.timer = timer;
122+
}
123+
124+
@Override public void onComplete(AsyncEvent e) {
125+
tryComplete();
126+
}
127+
128+
@Override public void onTimeout(AsyncEvent e) {
129+
tryComplete();
130+
}
131+
132+
@Override public void onError(AsyncEvent e) {
133+
tryComplete();
134+
}
135+
136+
/** Only observes the first completion event */
137+
void tryComplete() {
138+
if (completed) return;
139+
timer.observeDuration();
140+
completed = true;
141+
}
142+
143+
/** If another async is created (ex via asyncContext.dispatch), this needs to be re-attached */
144+
@Override public void onStartAsync(AsyncEvent event) {
145+
AsyncContext eventAsyncContext = event.getAsyncContext();
146+
if (eventAsyncContext != null) eventAsyncContext.addListener(this);
147+
}
41148
}
42149
}

zipkin-server/src/test/java/zipkin/server/ZipkinServerConfigurationTest.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
package zipkin.server;
1515

1616
import com.github.kristofa.brave.Brave;
17+
import io.prometheus.client.Histogram;
1718
import org.junit.After;
1819
import org.junit.Test;
1920
import org.springframework.beans.factory.NoSuchBeanDefinitionException;
@@ -27,6 +28,7 @@
2728
import org.springframework.context.annotation.Bean;
2829
import org.springframework.context.annotation.Configuration;
2930
import org.springframework.context.annotation.Import;
31+
import zipkin.autoconfigure.metrics.PrometheusMetricsAutoConfiguration;
3032
import zipkin.autoconfigure.ui.ZipkinUiAutoConfiguration;
3133
import zipkin.server.brave.BraveConfiguration;
3234

@@ -67,6 +69,18 @@ public void httpCollector_canDisable() {
6769
context.getBean(ZipkinHttpCollector.class);
6870
}
6971

72+
@Test public void prometheusHistogram() {
73+
context.register(
74+
PropertyPlaceholderAutoConfiguration.class,
75+
ZipkinServerConfigurationTest.Config.class,
76+
ZipkinServerConfiguration.class,
77+
PrometheusMetricsAutoConfiguration.class
78+
);
79+
context.refresh();
80+
81+
assertThat(context.getBean("http_request_duration_seconds", Histogram.class)).isNotNull();
82+
}
83+
7084
@Test public void query_enabledByDefault() {
7185
context.register(
7286
PropertyPlaceholderAutoConfiguration.class,

zipkin-server/src/test/java/zipkin/server/ZipkinServerIntegrationTest.java

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
package zipkin.server;
1515

1616
import com.jayway.jsonpath.JsonPath;
17+
import io.prometheus.client.Histogram;
1718
import java.io.IOException;
1819
import java.util.List;
1920
import okhttp3.MediaType;
@@ -53,17 +54,16 @@
5354
@RunWith(SpringRunner.class)
5455
public class ZipkinServerIntegrationTest {
5556

56-
@Autowired
57-
InMemoryStorage storage;
58-
@Autowired
59-
ActuateCollectorMetrics metrics;
60-
@LocalServerPort
61-
int zipkinPort;
57+
@Autowired InMemoryStorage storage;
58+
@Autowired ActuateCollectorMetrics metrics;
59+
@Autowired Histogram duration;
60+
@LocalServerPort int zipkinPort;
6261

6362
OkHttpClient client = new OkHttpClient.Builder().followRedirects(false).build();
6463

6564
@Before public void init() {
6665
storage.clear();
66+
duration.clear();
6767
metrics.forTransport("http").reset();
6868
}
6969

@@ -117,6 +117,21 @@ public class ZipkinServerIntegrationTest {
117117
.isEqualTo(spans.size());
118118
}
119119

120+
/** Makes sure the prometheus filter doesn't count twice */
121+
@Test public void writeSpans_updatesPrometheusMetrics() throws Exception {
122+
List<Span> spans = asList(LOTS_OF_SPANS[0], LOTS_OF_SPANS[1], LOTS_OF_SPANS[2]);
123+
byte[] body = Codec.JSON.writeSpans(spans);
124+
post("/api/v1/spans", body);
125+
post("/api/v1/spans", body);
126+
127+
Response response = get("/prometheus");
128+
assertThat(response.isSuccessful()).isTrue();
129+
String prometheus = response.body().string();
130+
131+
assertThat(prometheus)
132+
.contains("http_request_duration_seconds_count{path=\"/api/v1/spans\",method=\"POST\",} 2.0");
133+
}
134+
120135
@Test public void tracesQueryRequiresNoParameters() throws Exception {
121136
byte[] body = Codec.JSON.writeSpans(TRACE);
122137
post("/api/v1/spans", body);

0 commit comments

Comments
 (0)