[Metrics] Re-try failed protocol buffer uploads as well as failed XML uploads.
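
Previously only a failed XML upload was re-tried: the protobuf staged log was
discarded as soon as its request was created, so a failed protobuf upload was
simply lost. With this change each pipeline's response is handled
independently, and a staged log is discarded only once its own upload succeeds
or is deliberately abandoned, so a failed protobuf upload is re-tried just like
a failed XML upload.

A minimal standalone sketch of the per-pipeline retry decision this change
introduces; ShouldDiscardStagedLog and its parameter names are illustrative
stand-ins for the inline logic in MetricsService::OnURLFetchComplete, not real
helpers in the tree:

  #include <cstddef>

  // Mirrors the per-pipeline decision: a staged log is dropped only when its
  // own upload succeeded (HTTP 200), was rejected as malformed (HTTP 400), or
  // is too large to keep re-sending; otherwise it is kept for a re-try.
  bool ShouldDiscardStagedLog(int response_code, size_t log_size,
                              size_t avoid_retransmit_size) {
    if (response_code == 200)
      return true;   // Upload went through; nothing left to re-send.
    if (log_size > avoid_retransmit_size)
      return true;   // Too large to keep retransmitting.
    if (response_code == 400)
      return true;   // Bad syntax; retransmission won't help.
    return false;    // Keep the staged log so this pipeline is re-tried.
  }

Applying this check separately to the XML and protobuf fetchers is what allows
exactly one of the two uploads to be re-tried when the other succeeds.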

BUG=109818
TEST=none


Review URL: https://chromiumcodereview.appspot.com/10546044

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@141570 0039d316-1c4b-4281-b951-d872f2087c98
diff --git a/chrome/browser/metrics/metrics_service.cc b/chrome/browser/metrics/metrics_service.cc
index fef349d..7da0645e 100644
--- a/chrome/browser/metrics/metrics_service.cc
+++ b/chrome/browser/metrics/metrics_service.cc
@@ -239,11 +239,6 @@
 // Interval, in minutes, between state saves.
 const int kSaveStateIntervalMinutes = 5;
 
-// Used to indicate that the response code is currently not set at all --
-// RESPONSE_CODE_INVALID can sometimes be returned in response to a request if,
-// e.g., the server is down.
-const int kNoResponseCode = net::URLFetcher::RESPONSE_CODE_INVALID - 1;
-
 enum ResponseStatus {
   UNKNOWN_FAILURE,
   SUCCESS,
@@ -1017,8 +1012,12 @@
     // We may race here, and send second copy of initial log later.
     if (state_ == INITIAL_LOG_READY)
       state_ = SENDING_OLD_LOGS;
-    MetricsLogManager::StoreType store_type = current_fetch_xml_.get() ?
-        MetricsLogManager::PROVISIONAL_STORE : MetricsLogManager::NORMAL_STORE;
+
+    MetricsLogManager::StoreType store_type;
+    if (current_fetch_xml_.get() || current_fetch_proto_.get())
+      store_type = MetricsLogManager::PROVISIONAL_STORE;
+    else
+      store_type = MetricsLogManager::NORMAL_STORE;
     log_manager_.StoreStagedLogAsUnsent(store_type);
   }
   DCHECK(!log_manager_.has_staged_log());
@@ -1121,14 +1120,6 @@
   }
 
   MakeStagedLog();
-
-  // MakeStagedLog should have prepared log text; if it didn't, skip this
-  // upload and hope things work out next time.
-  if (log_manager_.staged_log_text().empty()) {
-    scheduler_->UploadCancelled();
-    return;
-  }
-
   SendStagedLog();
 }
 
@@ -1139,7 +1130,7 @@
   switch (state_) {
     case INITIALIZED:
     case INIT_TASK_SCHEDULED:  // We should be further along by now.
-      DCHECK(false);
+      NOTREACHED();
       return;
 
     case INIT_TASK_DONE:
@@ -1207,26 +1198,21 @@
 
   PrepareFetchWithStagedLog();
 
-  if (!current_fetch_xml_.get()) {
-    DCHECK(!current_fetch_proto_.get());
+  if (!current_fetch_xml_.get() && !current_fetch_proto_.get()) {
     // Compression failed, and log discarded :-/.
+    // Skip this upload and hope things work out next time.
     log_manager_.DiscardStagedLog();
     scheduler_->UploadCancelled();
     // TODO(jar): If compression failed, we should have created a tiny log and
     // compressed that, so that we can signal that we're losing logs.
     return;
   }
-  // Currently, the staged log for the protobuf version of the data is discarded
-  // after we create the URL request, so that there is no chance for
-  // re-transmission in case the corresponding XML request fails.  We will
-  // handle protobuf failures more carefully once that becomes the main
-  // pipeline, i.e. once we switch away from the XML pipeline.
-  DCHECK(current_fetch_proto_.get() || !log_manager_.has_staged_log_proto());
 
   DCHECK(!waiting_for_asynchronous_reporting_step_);
-
   waiting_for_asynchronous_reporting_step_ = true;
-  current_fetch_xml_->Start();
+
+  if (current_fetch_xml_.get())
+    current_fetch_xml_->Start();
   if (current_fetch_proto_.get())
     current_fetch_proto_->Start();
 
@@ -1234,20 +1220,22 @@
 }
 
 void MetricsService::PrepareFetchWithStagedLog() {
-  DCHECK(!log_manager_.staged_log_text().empty());
+  DCHECK(log_manager_.has_staged_log());
 
   // Prepare the XML version.
   DCHECK(!current_fetch_xml_.get());
-  current_fetch_xml_.reset(content::URLFetcher::Create(
-      GURL(server_url_xml_), net::URLFetcher::POST, this));
-  current_fetch_xml_->SetRequestContext(
-      g_browser_process->system_request_context());
-  current_fetch_xml_->SetUploadData(kMimeTypeXml,
-                                    log_manager_.staged_log_text().xml);
-  // We already drop cookies server-side, but we might as well strip them out
-  // client-side as well.
-  current_fetch_xml_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
-                                   net::LOAD_DO_NOT_SEND_COOKIES);
+  if (log_manager_.has_staged_log_xml()) {
+    current_fetch_xml_.reset(content::URLFetcher::Create(
+        GURL(server_url_xml_), net::URLFetcher::POST, this));
+    current_fetch_xml_->SetRequestContext(
+        g_browser_process->system_request_context());
+    current_fetch_xml_->SetUploadData(kMimeTypeXml,
+                                      log_manager_.staged_log_text().xml);
+    // We already drop cookies server-side, but we might as well strip them out
+    // client-side as well.
+    current_fetch_xml_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
+                                     net::LOAD_DO_NOT_SEND_COOKIES);
+  }
 
   // Prepare the protobuf version.
   DCHECK(!current_fetch_proto_.get());
@@ -1262,12 +1250,6 @@
     // client-side as well.
     current_fetch_proto_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
                                        net::LOAD_DO_NOT_SEND_COOKIES);
-
-    // Discard the protobuf version of the staged log, so that we will avoid
-    // re-uploading it even if we need to re-upload the XML version.
-    // TODO(isherman): Handle protobuf upload failures more gracefully once we
-    // transition away from the XML-based pipeline.
-    log_manager_.DiscardStagedLogProto();
   }
 }
 
@@ -1295,22 +1277,9 @@
 }
 
 // We need to wait for two responses: the response to the XML upload, and the
-// response to the protobuf upload.  For now, only the XML upload's response
-// affects decisions like whether to retry the upload, whether to abandon the
-// upload because it is too large, etc.  However, we still need to wait for the
-// protobuf upload, as we cannot reset |current_fetch_proto_| until we have
-// confirmation that the network request was sent; and the easiest way to do
-// that is to wait for the response.  In case the XML upload's response arrives
-// first, we cache that response until the protobuf upload's response also
-// arrives.
-//
-// Note that if the XML upload succeeds but the protobuf upload fails, we will
-// not retry the protobuf upload.  If the XML upload fails while the protobuf
-// upload succeeds, we will still avoid re-uploading the protobuf data because
-// we "zap" the data after the first upload attempt.  This means that we might
-// lose protobuf uploads when XML ones succeed; but we will never duplicate any
-// protobuf uploads.  Protobuf failures should be rare enough to where this
-// should be ok while we have the two pipelines running in parallel.
+// response to the protobuf upload.  In the case that exactly one of the uploads
+// fails and needs to be retried, we "zap" the other pipeline's staged log to
+// avoid incorrectly re-uploading it as well.
 void MetricsService::OnURLFetchComplete(const net::URLFetcher* source) {
   DCHECK(waiting_for_asynchronous_reporting_step_);
 
@@ -1318,72 +1287,76 @@
   // Note however that |source| is aliased to one of these, so we should be
   // careful not to delete it too early.
   scoped_ptr<net::URLFetcher> s;
+  bool is_xml;
   if (source == current_fetch_xml_.get()) {
     s.reset(current_fetch_xml_.release());
-
-    // Cache the XML responses, in case we still need to wait for the protobuf
-    // response.
-    response_code_ = source->GetResponseCode();
-    response_status_ = StatusToString(source->GetStatus());
-    source->GetResponseAsString(&response_data_);
-
-    // Log a histogram to track response success vs. failure rates.
-    UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.XML",
-                              ResponseCodeToStatus(response_code_),
-                              NUM_RESPONSE_STATUSES);
+    is_xml = true;
   } else if (source == current_fetch_proto_.get()) {
     s.reset(current_fetch_proto_.release());
-
-    // Log a histogram to track response success vs. failure rates.
-    UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.Protobuf",
-                              ResponseCodeToStatus(source->GetResponseCode()),
-                              NUM_RESPONSE_STATUSES);
+    is_xml = false;
   } else {
     NOTREACHED();
     return;
   }
 
-  // If we're still waiting for one of the responses, keep waiting...
-  if (current_fetch_xml_.get() || current_fetch_proto_.get())
-    return;
+  int response_code = source->GetResponseCode();
 
-  // We should only be able to reach here once we've received responses to both
-  // the XML and the protobuf requests.  We should always have the response code
-  // available.
-  DCHECK_NE(response_code_, kNoResponseCode);
-  waiting_for_asynchronous_reporting_step_ = false;
+  // Log a histogram to track response success vs. failure rates.
+  if (is_xml) {
+    UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.XML",
+                              ResponseCodeToStatus(response_code),
+                              NUM_RESPONSE_STATUSES);
+  } else {
+    UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.Protobuf",
+                              ResponseCodeToStatus(response_code),
+                              NUM_RESPONSE_STATUSES);
+  }
 
   // If the upload was provisionally stored, drop it now that the upload is
   // known to have gone through.
   log_manager_.DiscardLastProvisionalStore();
 
   // Confirm send so that we can move on.
-  VLOG(1) << "Metrics response code: " << response_code_
-          << " status=" << response_status_;
+  VLOG(1) << "Metrics response code: " << response_code
+          << " status=" << StatusToString(source->GetStatus()) << " for "
+          << (is_xml ? "xml" : "protobuf") << " upload.";
 
-  bool upload_succeeded = response_code_ == 200;
+  bool upload_succeeded = response_code == 200;
 
   // Provide boolean for error recovery (allow us to ignore response_code).
   bool discard_log = false;
-
-  if (!upload_succeeded &&
-      log_manager_.staged_log_text().xml.length() >
-          kUploadLogAvoidRetransmitSize) {
-    UMA_HISTOGRAM_COUNTS(
-        "UMA.Large Rejected Log was Discarded",
-        static_cast<int>(log_manager_.staged_log_text().xml.length()));
+  const size_t log_size = is_xml ?
+      log_manager_.staged_log_text().xml.length() :
+      log_manager_.staged_log_text().proto.length();
+  if (!upload_succeeded && log_size > kUploadLogAvoidRetransmitSize) {
+    UMA_HISTOGRAM_COUNTS("UMA.Large Rejected Log was Discarded",
+                         static_cast<int>(log_size));
     discard_log = true;
-  } else if (response_code_ == 400) {
+  } else if (response_code == 400) {
     // Bad syntax.  Retransmission won't work.
     discard_log = true;
   }
 
   if (!upload_succeeded && !discard_log) {
-    VLOG(1) << "Metrics: transmission attempt returned a failure code: "
-            << response_code_ << ". Verify network connectivity";
-    LogBadResponseCode();
+    LogBadResponseCode(response_code, is_xml);
   } else {  // Successful receipt (or we are discarding log).
-    VLOG(1) << "Metrics response data: " << response_data_;
+    std::string response_data;
+    source->GetResponseAsString(&response_data);
+    VLOG(1) << "Metrics response data: " << response_data;
+
+    if (is_xml)
+      log_manager_.DiscardStagedLogXml();
+    else
+      log_manager_.DiscardStagedLogProto();
+  }
+
+  // If we're still waiting for one of the responses, keep waiting...
+  if (current_fetch_xml_.get() || current_fetch_proto_.get())
+    return;
+
+  waiting_for_asynchronous_reporting_step_ = false;
+
+  if (!log_manager_.has_staged_log()) {
     switch (state_) {
       case INITIAL_LOG_READY:
         state_ = SENDING_OLD_LOGS;
@@ -1402,18 +1375,23 @@
         break;
     }
 
-    log_manager_.DiscardStagedLog();
-
     if (log_manager_.has_unsent_logs())
       DCHECK_LT(state_, SENDING_CURRENT_LOGS);
   }
 
   // Error 400 indicates a problem with the log, not with the server, so
   // don't consider that a sign that the server is in trouble.
-  bool server_is_healthy = upload_succeeded || response_code_ == 400;
+  bool server_is_healthy = upload_succeeded || response_code == 400;
 
-  scheduler_->UploadFinished(server_is_healthy,
-                             log_manager_.has_unsent_logs());
+  // Note that we are essentially randomly choosing to report either the XML or
+  // the protobuf server's health status, but this is ok: In the case that the
+  // two statuses are not the same, one of the uploads succeeded but the other
+  // did not. In this case, we'll re-try only the upload that failed. This first
+  // re-try might ignore the server's unhealthiness; but the response to the
+  // re-tried upload will correctly propagate the server's health status for any
+  // subsequent re-tries. Hence, we'll at most delay slowing the upload rate by
+  // one re-try, which is fine.
+  scheduler_->UploadFinished(server_is_healthy, log_manager_.has_unsent_logs());
 
   // Collect network stats if UMA upload succeeded.
   IOThread* io_thread = g_browser_process->io_thread();
@@ -1422,21 +1400,25 @@
     chrome_browser_net::CollectPipeliningCapabilityStatsOnUIThread(
         http_pipelining_test_server_, io_thread);
   }
-
-  // Reset the cached response data.
-  response_code_ = kNoResponseCode;
-  response_data_ = std::string();
-  response_status_ = std::string();
 }
 
-void MetricsService::LogBadResponseCode() {
-  VLOG(1) << "Verify your metrics logs are formatted correctly.  Verify server "
-             "is active at " << server_url_xml_;
+void MetricsService::LogBadResponseCode(int response_code, bool is_xml) {
+  VLOG(1) << "Metrics: transmission attempt returned a failure code: "
+          << response_code << ". Verify network connectivity and that your "
+          << "metrics logs are formatted correctly. Verify server is active at "
+          << (is_xml ? server_url_xml_ : server_url_proto_) << ".";
+
   if (!log_manager_.has_staged_log()) {
-    VLOG(1) << "METRICS: Recorder shutdown during log transmission.";
+    // TODO(isherman): I don't think this code is reachable...
+    VLOG(1) << "Metrics: Recorder shutdown during log transmission.";
   } else {
-    VLOG(1) << "METRICS: transmission retry being scheduled for "
-            << log_manager_.staged_log_text().xml;
+    if (is_xml) {
+      VLOG(1) << "Metrics: transmission retry being scheduled for \n"
+              << log_manager_.staged_log_text().xml;
+    } else {
+      VLOG(1)
+          << "Metrics: transmission retry being scheduled for protobuf upload.";
+    }
   }
 }