[Metrics] Re-try failed protocol buffer uploads as well as failed XML uploads.
BUG=109818
TEST=none
Review URL: https://chromiumcodereview.appspot.com/10546044
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@141570 0039d316-1c4b-4281-b951-d872f2087c98
diff --git a/chrome/browser/metrics/metrics_service.cc b/chrome/browser/metrics/metrics_service.cc
index fef349d..7da0645e 100644
--- a/chrome/browser/metrics/metrics_service.cc
+++ b/chrome/browser/metrics/metrics_service.cc
@@ -239,11 +239,6 @@
// Interval, in minutes, between state saves.
const int kSaveStateIntervalMinutes = 5;
-// Used to indicate that the response code is currently not set at all --
-// RESPONSE_CODE_INVALID can sometimes be returned in response to a request if,
-// e.g., the server is down.
-const int kNoResponseCode = net::URLFetcher::RESPONSE_CODE_INVALID - 1;
-
enum ResponseStatus {
UNKNOWN_FAILURE,
SUCCESS,
@@ -1017,8 +1012,12 @@
// We may race here, and send second copy of initial log later.
if (state_ == INITIAL_LOG_READY)
state_ = SENDING_OLD_LOGS;
- MetricsLogManager::StoreType store_type = current_fetch_xml_.get() ?
- MetricsLogManager::PROVISIONAL_STORE : MetricsLogManager::NORMAL_STORE;
+
+ MetricsLogManager::StoreType store_type;
+ if (current_fetch_xml_.get() || current_fetch_proto_.get())
+ store_type = MetricsLogManager::PROVISIONAL_STORE;
+ else
+ store_type = MetricsLogManager::NORMAL_STORE;
log_manager_.StoreStagedLogAsUnsent(store_type);
}
DCHECK(!log_manager_.has_staged_log());
@@ -1121,14 +1120,6 @@
}
MakeStagedLog();
-
- // MakeStagedLog should have prepared log text; if it didn't, skip this
- // upload and hope things work out next time.
- if (log_manager_.staged_log_text().empty()) {
- scheduler_->UploadCancelled();
- return;
- }
-
SendStagedLog();
}
@@ -1139,7 +1130,7 @@
switch (state_) {
case INITIALIZED:
case INIT_TASK_SCHEDULED: // We should be further along by now.
- DCHECK(false);
+ NOTREACHED();
return;
case INIT_TASK_DONE:
@@ -1207,26 +1198,21 @@
PrepareFetchWithStagedLog();
- if (!current_fetch_xml_.get()) {
- DCHECK(!current_fetch_proto_.get());
+ if (!current_fetch_xml_.get() && !current_fetch_proto_.get()) {
// Compression failed, and log discarded :-/.
+ // Skip this upload and hope things work out next time.
log_manager_.DiscardStagedLog();
scheduler_->UploadCancelled();
// TODO(jar): If compression failed, we should have created a tiny log and
// compressed that, so that we can signal that we're losing logs.
return;
}
- // Currently, the staged log for the protobuf version of the data is discarded
- // after we create the URL request, so that there is no chance for
- // re-transmission in case the corresponding XML request fails. We will
- // handle protobuf failures more carefully once that becomes the main
- // pipeline, i.e. once we switch away from the XML pipeline.
- DCHECK(current_fetch_proto_.get() || !log_manager_.has_staged_log_proto());
DCHECK(!waiting_for_asynchronous_reporting_step_);
-
waiting_for_asynchronous_reporting_step_ = true;
- current_fetch_xml_->Start();
+
+ if (current_fetch_xml_.get())
+ current_fetch_xml_->Start();
if (current_fetch_proto_.get())
current_fetch_proto_->Start();
@@ -1234,20 +1220,22 @@
}
void MetricsService::PrepareFetchWithStagedLog() {
- DCHECK(!log_manager_.staged_log_text().empty());
+ DCHECK(log_manager_.has_staged_log());
// Prepare the XML version.
DCHECK(!current_fetch_xml_.get());
- current_fetch_xml_.reset(content::URLFetcher::Create(
- GURL(server_url_xml_), net::URLFetcher::POST, this));
- current_fetch_xml_->SetRequestContext(
- g_browser_process->system_request_context());
- current_fetch_xml_->SetUploadData(kMimeTypeXml,
- log_manager_.staged_log_text().xml);
- // We already drop cookies server-side, but we might as well strip them out
- // client-side as well.
- current_fetch_xml_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
- net::LOAD_DO_NOT_SEND_COOKIES);
+ if (log_manager_.has_staged_log_xml()) {
+ current_fetch_xml_.reset(content::URLFetcher::Create(
+ GURL(server_url_xml_), net::URLFetcher::POST, this));
+ current_fetch_xml_->SetRequestContext(
+ g_browser_process->system_request_context());
+ current_fetch_xml_->SetUploadData(kMimeTypeXml,
+ log_manager_.staged_log_text().xml);
+ // We already drop cookies server-side, but we might as well strip them out
+ // client-side as well.
+ current_fetch_xml_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
+ net::LOAD_DO_NOT_SEND_COOKIES);
+ }
// Prepare the protobuf version.
DCHECK(!current_fetch_proto_.get());
@@ -1262,12 +1250,6 @@
// client-side as well.
current_fetch_proto_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
net::LOAD_DO_NOT_SEND_COOKIES);
-
- // Discard the protobuf version of the staged log, so that we will avoid
- // re-uploading it even if we need to re-upload the XML version.
- // TODO(isherman): Handle protobuf upload failures more gracefully once we
- // transition away from the XML-based pipeline.
- log_manager_.DiscardStagedLogProto();
}
}
@@ -1295,22 +1277,9 @@
}
// We need to wait for two responses: the response to the XML upload, and the
-// response to the protobuf upload. For now, only the XML upload's response
-// affects decisions like whether to retry the upload, whether to abandon the
-// upload because it is too large, etc. However, we still need to wait for the
-// protobuf upload, as we cannot reset |current_fetch_proto_| until we have
-// confirmation that the network request was sent; and the easiest way to do
-// that is to wait for the response. In case the XML upload's response arrives
-// first, we cache that response until the protobuf upload's response also
-// arrives.
-//
-// Note that if the XML upload succeeds but the protobuf upload fails, we will
-// not retry the protobuf upload. If the XML upload fails while the protobuf
-// upload succeeds, we will still avoid re-uploading the protobuf data because
-// we "zap" the data after the first upload attempt. This means that we might
-// lose protobuf uploads when XML ones succeed; but we will never duplicate any
-// protobuf uploads. Protobuf failures should be rare enough to where this
-// should be ok while we have the two pipelines running in parallel.
+// response to the protobuf upload. In the case that exactly one of the uploads
+// fails and needs to be retried, we "zap" the other pipeline's staged log to
+// avoid incorrectly re-uploading it as well.
void MetricsService::OnURLFetchComplete(const net::URLFetcher* source) {
DCHECK(waiting_for_asynchronous_reporting_step_);
@@ -1318,72 +1287,76 @@
// Note however that |source| is aliased to one of these, so we should be
// careful not to delete it too early.
scoped_ptr<net::URLFetcher> s;
+ bool is_xml;
if (source == current_fetch_xml_.get()) {
s.reset(current_fetch_xml_.release());
-
- // Cache the XML responses, in case we still need to wait for the protobuf
- // response.
- response_code_ = source->GetResponseCode();
- response_status_ = StatusToString(source->GetStatus());
- source->GetResponseAsString(&response_data_);
-
- // Log a histogram to track response success vs. failure rates.
- UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.XML",
- ResponseCodeToStatus(response_code_),
- NUM_RESPONSE_STATUSES);
+ is_xml = true;
} else if (source == current_fetch_proto_.get()) {
s.reset(current_fetch_proto_.release());
-
- // Log a histogram to track response success vs. failure rates.
- UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.Protobuf",
- ResponseCodeToStatus(source->GetResponseCode()),
- NUM_RESPONSE_STATUSES);
+ is_xml = false;
} else {
NOTREACHED();
return;
}
- // If we're still waiting for one of the responses, keep waiting...
- if (current_fetch_xml_.get() || current_fetch_proto_.get())
- return;
+ int response_code = source->GetResponseCode();
- // We should only be able to reach here once we've received responses to both
- // the XML and the protobuf requests. We should always have the response code
- // available.
- DCHECK_NE(response_code_, kNoResponseCode);
- waiting_for_asynchronous_reporting_step_ = false;
+ // Log a histogram to track response success vs. failure rates.
+ if (is_xml) {
+ UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.XML",
+ ResponseCodeToStatus(response_code),
+ NUM_RESPONSE_STATUSES);
+ } else {
+ UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.Protobuf",
+ ResponseCodeToStatus(response_code),
+ NUM_RESPONSE_STATUSES);
+ }
// If the upload was provisionally stored, drop it now that the upload is
// known to have gone through.
log_manager_.DiscardLastProvisionalStore();
// Confirm send so that we can move on.
- VLOG(1) << "Metrics response code: " << response_code_
- << " status=" << response_status_;
+ VLOG(1) << "Metrics response code: " << response_code
+ << " status=" << StatusToString(source->GetStatus()) << " for "
+ << (is_xml ? "xml" : "protobuf") << " upload.";
- bool upload_succeeded = response_code_ == 200;
+ bool upload_succeeded = response_code == 200;
// Provide boolean for error recovery (allow us to ignore response_code).
bool discard_log = false;
-
- if (!upload_succeeded &&
- log_manager_.staged_log_text().xml.length() >
- kUploadLogAvoidRetransmitSize) {
- UMA_HISTOGRAM_COUNTS(
- "UMA.Large Rejected Log was Discarded",
- static_cast<int>(log_manager_.staged_log_text().xml.length()));
+ const size_t log_size = is_xml ?
+ log_manager_.staged_log_text().xml.length() :
+ log_manager_.staged_log_text().proto.length();
+ if (!upload_succeeded && log_size > kUploadLogAvoidRetransmitSize) {
+ UMA_HISTOGRAM_COUNTS("UMA.Large Rejected Log was Discarded",
+ static_cast<int>(log_size));
discard_log = true;
- } else if (response_code_ == 400) {
+ } else if (response_code == 400) {
// Bad syntax. Retransmission won't work.
discard_log = true;
}
if (!upload_succeeded && !discard_log) {
- VLOG(1) << "Metrics: transmission attempt returned a failure code: "
- << response_code_ << ". Verify network connectivity";
- LogBadResponseCode();
+ LogBadResponseCode(response_code, is_xml);
} else { // Successful receipt (or we are discarding log).
- VLOG(1) << "Metrics response data: " << response_data_;
+ std::string response_data;
+ source->GetResponseAsString(&response_data);
+ VLOG(1) << "Metrics response data: " << response_data;
+
+ if (is_xml)
+ log_manager_.DiscardStagedLogXml();
+ else
+ log_manager_.DiscardStagedLogProto();
+ }
+
+ // If we're still waiting for one of the responses, keep waiting...
+ if (current_fetch_xml_.get() || current_fetch_proto_.get())
+ return;
+
+ waiting_for_asynchronous_reporting_step_ = false;
+
+ if (!log_manager_.has_staged_log()) {
switch (state_) {
case INITIAL_LOG_READY:
state_ = SENDING_OLD_LOGS;
@@ -1402,18 +1375,23 @@
break;
}
- log_manager_.DiscardStagedLog();
-
if (log_manager_.has_unsent_logs())
DCHECK_LT(state_, SENDING_CURRENT_LOGS);
}
// Error 400 indicates a problem with the log, not with the server, so
// don't consider that a sign that the server is in trouble.
- bool server_is_healthy = upload_succeeded || response_code_ == 400;
+ bool server_is_healthy = upload_succeeded || response_code == 400;
- scheduler_->UploadFinished(server_is_healthy,
- log_manager_.has_unsent_logs());
+ // Note that we are essentially randomly choosing to report either the XML or
+ // the protobuf server's health status, but this is ok: In the case that the
+ // two statuses are not the same, one of the uploads succeeded but the other
+ // did not. In this case, we'll re-try only the upload that failed. This first
+ // re-try might ignore the server's unhealthiness; but the response to the
+ // re-tried upload will correctly propagate the server's health status for any
+ // subsequent re-tries. Hence, we'll at most delay slowing the upload rate by
+ // one re-try, which is fine.
+ scheduler_->UploadFinished(server_is_healthy, log_manager_.has_unsent_logs());
// Collect network stats if UMA upload succeeded.
IOThread* io_thread = g_browser_process->io_thread();
@@ -1422,21 +1400,25 @@
chrome_browser_net::CollectPipeliningCapabilityStatsOnUIThread(
http_pipelining_test_server_, io_thread);
}
-
- // Reset the cached response data.
- response_code_ = kNoResponseCode;
- response_data_ = std::string();
- response_status_ = std::string();
}
-void MetricsService::LogBadResponseCode() {
- VLOG(1) << "Verify your metrics logs are formatted correctly. Verify server "
- "is active at " << server_url_xml_;
+void MetricsService::LogBadResponseCode(int response_code, bool is_xml) {
+ VLOG(1) << "Metrics: transmission attempt returned a failure code: "
+ << response_code << ". Verify network connectivity and that your "
+ << "metrics logs are formatted correctly. Verify server is active at "
+ << (is_xml ? server_url_xml_ : server_url_proto_) << ".";
+
if (!log_manager_.has_staged_log()) {
- VLOG(1) << "METRICS: Recorder shutdown during log transmission.";
+ // TODO(isherman): I don't think this code is reachable...
+ VLOG(1) << "Metrics: Recorder shutdown during log transmission.";
} else {
- VLOG(1) << "METRICS: transmission retry being scheduled for "
- << log_manager_.staged_log_text().xml;
+ if (is_xml) {
+ VLOG(1) << "Metrics: transmission retry being scheduled for \n"
+ << log_manager_.staged_log_text().xml;
+ } else {
+ VLOG(1)
+ << "Metrics: transmission retry being scheduled for protobuf upload.";
+ }
}
}