613 lines
49 KiB
HTML
613 lines
49 KiB
HTML
|
<!DOCTYPE HTML>
|
|||
|
<html lang="en" class="sidebar-visible no-js light">
|
|||
|
<head>
    <!-- Book generated using mdBook -->
    <!-- Single character-encoding declaration for the document. A second
         http-equiv="Content-Type" declaration is not permitted alongside it. -->
    <meta charset="UTF-8">
    <title>Storing UTF-8 Encoded Text with Strings - The Rust Programming Language</title>

    <meta name="description" content="">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="theme-color" content="#ffffff">

    <link rel="shortcut icon" href="favicon.png">
    <link rel="stylesheet" href="css/variables.css">
    <link rel="stylesheet" href="css/general.css">
    <link rel="stylesheet" href="css/chrome.css">
    <link rel="stylesheet" href="css/print.css" media="print">

    <!-- Fonts -->
    <link rel="stylesheet" href="FontAwesome/css/font-awesome.css">
    <link rel="stylesheet" href="googleFonts/css.css">

    <!-- Highlight.js Stylesheets -->
    <link rel="stylesheet" href="highlight.css">
    <link rel="stylesheet" href="tomorrow-night.css">
    <link rel="stylesheet" href="ayu-highlight.css">

    <!-- Custom theme stylesheets -->
    <link rel="stylesheet" href="ferris.css">
    <link rel="stylesheet" href="theme/2018-edition.css">
</head>
|
|||
|
<body>
|
|||
|
<!-- Provide site root to javascript -->
<script type="text/javascript">
    // Site root handed to the scripts that follow; an empty string on this page.
    var path_to_root = "";
    // Fallback theme used when no theme is stored. "light" is used regardless of
    // the prefers-color-scheme setting, so no media query is evaluated here.
    var default_theme = "light";
</script>
|
|||
|
|
|||
|
<!-- Work around some values being stored in localStorage wrapped in quotes -->
<script type="text/javascript">
    // Best-effort clean-up: any error raised while touching localStorage is
    // ignored and the stored values are simply left as they are.
    try {
        var theme = localStorage.getItem('mdbook-theme');
        var sidebar = localStorage.getItem('mdbook-sidebar');

        // getItem returns null for a key that was never stored. Each value is
        // guarded on its own so that a missing theme cannot raise a TypeError
        // that skips the sidebar clean-up below.
        if (theme && theme.startsWith('"') && theme.endsWith('"')) {
            localStorage.setItem('mdbook-theme', theme.slice(1, theme.length - 1));
        }

        if (sidebar && sidebar.startsWith('"') && sidebar.endsWith('"')) {
            localStorage.setItem('mdbook-sidebar', sidebar.slice(1, sidebar.length - 1));
        }
    } catch (e) { }
</script>
|
|||
|
|
|||
|
<!-- Set the theme before any content is loaded, prevents flash -->
<script type="text/javascript">
    var theme;
    // Best-effort read: if localStorage raises, theme stays unset and the
    // default is used.
    try { theme = localStorage.getItem('mdbook-theme'); } catch (e) { }
    // Fall back for a missing OR empty value. classList.add('') throws, which
    // would abort this script before the 'js' class is applied.
    if (!theme) { theme = default_theme; }
    var html = document.querySelector('html');
    // Swap the static markup defaults ('no-js', 'light') for the live state.
    html.classList.remove('no-js');
    html.classList.remove('light');
    html.classList.add(theme);
    html.classList.add('js');
</script>
|
|||
|
|
|||
|
<!-- Hide / unhide sidebar before it is displayed -->
<script type="text/javascript">
    var html = document.documentElement;
    // Start from 'hidden'; the stored preference is only consulted when the
    // body is at least 1080 pixels wide.
    var sidebar = 'hidden';
    if (document.body.clientWidth >= 1080) {
        try {
            sidebar = localStorage.getItem('mdbook-sidebar');
        } catch (e) { }
        // No stored value (null or empty) means the sidebar is shown.
        if (!sidebar) {
            sidebar = 'visible';
        }
    }
    // Replace the class from the static markup with the computed state.
    html.classList.remove('sidebar-visible');
    html.classList.add('sidebar-' + sidebar);
</script>
|
|||
|
|
|||
|
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
|
|||
|
<div id="sidebar-scrollbox" class="sidebar-scrollbox">
|
|||
|
<ol class="chapter"><li class="expanded affix "><a href="title-page.html">The Rust Programming Language</a></li><li class="expanded affix "><a href="foreword.html">Foreword</a></li><li class="expanded affix "><a href="ch00-00-introduction.html">Introduction</a></li><li class="expanded "><a href="ch01-00-getting-started.html"><strong aria-hidden="true">1.</strong> Getting Started</a></li><li><ol class="section"><li class="expanded "><a href="ch01-01-installation.html"><strong aria-hidden="true">1.1.</strong> Installation</a></li><li class="expanded "><a href="ch01-02-hello-world.html"><strong aria-hidden="true">1.2.</strong> Hello, World!</a></li><li class="expanded "><a href="ch01-03-hello-cargo.html"><strong aria-hidden="true">1.3.</strong> Hello, Cargo!</a></li></ol></li><li class="expanded "><a href="ch02-00-guessing-game-tutorial.html"><strong aria-hidden="true">2.</strong> Programming a Guessing Game</a></li><li class="expanded "><a href="ch03-00-common-programming-concepts.html"><strong aria-hidden="true">3.</strong> Common Programming Concepts</a></li><li><ol class="section"><li class="expanded "><a href="ch03-01-variables-and-mutability.html"><strong aria-hidden="true">3.1.</strong> Variables and Mutability</a></li><li class="expanded "><a href="ch03-02-data-types.html"><strong aria-hidden="true">3.2.</strong> Data Types</a></li><li class="expanded "><a href="ch03-03-how-functions-work.html"><strong aria-hidden="true">3.3.</strong> Functions</a></li><li class="expanded "><a href="ch03-04-comments.html"><strong aria-hidden="true">3.4.</strong> Comments</a></li><li class="expanded "><a href="ch03-05-control-flow.html"><strong aria-hidden="true">3.5.</strong> Control Flow</a></li></ol></li><li class="expanded "><a href="ch04-00-understanding-ownership.html"><strong aria-hidden="true">4.</strong> Understanding Ownership</a></li><li><ol class="section"><li class="expanded "><a href="ch04-01-what-is-ownership.html"><strong aria-hidden="true">4.1.</strong> What is 
Ownership?</a></li><li class="expanded "><a href="ch04-02-references-and-borrowing.html"><strong aria-hidden="true">4.2.</strong> References and Borrowing</a></li><li class="expanded "><a href="ch04-03-slices.html"><strong aria-hidden="true">4.3.</strong> The Slice Type</a></li></ol></li><li class="expanded "><a href="ch05-00-structs.html"><strong aria-hidden="true">5.</strong> Using Structs to Structure Related Data</a></li><li><ol class="section"><li class="expanded "><a href="ch05-01-defining-structs.html"><strong aria-hidden="true">5.1.</strong> Defining and Instantiating Structs</a></li><li class="expanded "><a href="ch05-02-example-structs.html"><strong aria-hidden="true">5.2.</strong> An Example Program Using Structs</a></li><li class="expanded "><a href="ch05-03-method-syntax.html"><strong aria-hidden="true">5.3.</strong> Method Syntax</a></li></ol></li><li class="expanded "><a href="ch06-00-enums.html"><strong aria-hidden="true">6.</strong> Enums and Pattern Matching</a></li><li><ol class="section"><li class="expanded "><a href="ch06-01-defining-an-enum.html"><strong aria-hidden="true">6.1.</strong> Defining an Enum</a></li><li class="expanded "><a href="ch06-02-match.html"><strong aria-hidden="true">6.2.</strong> The match Control Flow Operator</a></li><li class="expanded "><a href="ch06-03-if-let.html"><strong aria-hidden="true">6.3.</strong> Concise Control Flow with if let</a></li></ol></li><li class="expanded "><a href="ch07-00-managing-growing-projects-with-packages-crates-and-modules.html"><strong aria-hidden="true">7.</strong> Managing Growing Projects with Packages, Crates, and Modules</a></li><li><ol class="section"><li class="expanded "><a href="ch07-01-packages-and-crates.html"><strong aria-hidden="true">7.1.</strong> Packages and Crates</a></li><li class="expanded "><a href="ch07-02-defining-modules-to-control-scope-and-privacy.html"><strong aria-hidden="true">7.2.</strong> Defining Modules to Control Scope and Privacy</a></li><li 
class="expanded "><a href="ch07-03-paths-for-referring-to-an-item-in-the-module-tree.html"><
|
|||
|
</div>
|
|||
|
<div id="sidebar-resize-handle" class="sidebar-resize-handle"></div>
|
|||
|
</nav>
|
|||
|
|
|||
|
<div id="page-wrapper" class="page-wrapper">
|
|||
|
|
|||
|
<div class="page">
|
|||
|
|
|||
|
<!-- Top menu bar: sidebar toggle, theme picker, search toggle, book title and print link.
     The icon glyphs are decorative; every control takes its name from aria-label. -->
<div id="menu-bar" class="menu-bar">
    <div id="menu-bar-sticky-container">
        <div class="left-buttons">
            <button id="sidebar-toggle" class="icon-button" type="button" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
                <i class="fa fa-bars" aria-hidden="true"></i>
            </button>
            <button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
                <i class="fa fa-paint-brush" aria-hidden="true"></i>
            </button>
            <ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
                <li role="none"><button role="menuitem" class="theme" id="light" type="button">Light (default)</button></li>
                <li role="none"><button role="menuitem" class="theme" id="rust" type="button">Rust</button></li>
                <li role="none"><button role="menuitem" class="theme" id="coal" type="button">Coal</button></li>
                <li role="none"><button role="menuitem" class="theme" id="navy" type="button">Navy</button></li>
                <li role="none"><button role="menuitem" class="theme" id="ayu" type="button">Ayu</button></li>
            </ul>

            <button id="search-toggle" class="icon-button" type="button" title="Search. (Shortkey: s)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="S" aria-controls="searchbar">
                <i class="fa fa-search" aria-hidden="true"></i>
            </button>

        </div>

        <h1 class="menu-title">The Rust Programming Language</h1>

        <div class="right-buttons">
            <a href="print.html" title="Print this book" aria-label="Print this book">
                <i id="print-button" class="fa fa-print" aria-hidden="true"></i>
            </a>

        </div>
    </div>
</div>
|
|||
|
|
|||
|
|
|||
|
<!-- Search UI: the wrapper and the results container carry the "hidden" class in the static markup. -->
<div id="search-wrapper" class="hidden">
    <form id="searchbar-outer" class="searchbar-outer">
        <!-- One name attribute only: a duplicate is a parse error and only the first
             ("search") ever took effect. aria-label supplies the accessible name,
             since the placeholder is not a label. -->
        <input type="search" id="searchbar" name="search" placeholder="Search this book ..." aria-label="Search this book" aria-controls="searchresults-outer" aria-describedby="searchresults-header">
    </form>
    <div id="searchresults-outer" class="searchresults-outer hidden">
        <div id="searchresults-header" class="searchresults-header"></div>
        <ul id="searchresults">
        </ul>
    </div>
</div>
|
|||
|
|
|||
|
|
|||
|
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
<script type="text/javascript">
    // Wrapped in a function so the helper variables below do not become globals.
    (function () {
        // `sidebar` is the global set by the earlier sidebar script.
        var shown = sidebar === 'visible';
        document.getElementById('sidebar-toggle').setAttribute('aria-expanded', shown);
        document.getElementById('sidebar').setAttribute('aria-hidden', !shown);
        // Sidebar links get tabindex 0 when the sidebar is visible, -1 otherwise.
        var links = document.querySelectorAll('#sidebar a');
        for (var i = 0; i < links.length; i++) {
            links[i].setAttribute('tabIndex', shown ? 0 : -1);
        }
    })();
</script>
|
|||
|
|
|||
|
<div id="content" class="content">
|
|||
|
<main>
|
|||
|
<h2><a class="header" href="#storing-utf-8-encoded-text-with-strings" id="storing-utf-8-encoded-text-with-strings">Storing UTF-8 Encoded Text with Strings</a></h2>
|
|||
|
<p>We talked about strings in Chapter 4, but we’ll look at them in more depth now.
|
|||
|
New Rustaceans commonly get stuck on strings for a combination of three
|
|||
|
reasons: Rust’s propensity for exposing possible errors, strings being a more
|
|||
|
complicated data structure than many programmers give them credit for, and
|
|||
|
UTF-8. These factors combine in a way that can seem difficult when you’re
|
|||
|
coming from other programming languages.</p>
|
|||
|
<p>It’s useful to discuss strings in the context of collections because strings
|
|||
|
are implemented as a collection of bytes, plus some methods to provide useful
|
|||
|
functionality when those bytes are interpreted as text. In this section, we’ll
|
|||
|
talk about the operations on <code>String</code> that every collection type has, such as
|
|||
|
creating, updating, and reading. We’ll also discuss the ways in which <code>String</code>
|
|||
|
is different from the other collections, namely how indexing into a <code>String</code> is
|
|||
|
complicated by the differences between how people and computers interpret
|
|||
|
<code>String</code> data.</p>
|
|||
|
<h3><a class="header" href="#what-is-a-string" id="what-is-a-string">What Is a String?</a></h3>
|
|||
|
<p>We’ll first define what we mean by the term <em>string</em>. Rust has only one string
|
|||
|
type in the core language, which is the string slice <code>str</code> that is usually seen
|
|||
|
in its borrowed form <code>&str</code>. In Chapter 4, we talked about <em>string slices</em>,
|
|||
|
which are references to some UTF-8 encoded string data stored elsewhere. String
|
|||
|
literals, for example, are stored in the program’s binary and are therefore
|
|||
|
string slices.</p>
|
|||
|
<p>The <code>String</code> type, which is provided by Rust’s standard library rather than
|
|||
|
coded into the core language, is a growable, mutable, owned, UTF-8 encoded
|
|||
|
string type. When Rustaceans refer to “strings” in Rust, they usually mean the
|
|||
|
<code>String</code> and the string slice <code>&str</code> types, not just one of those types.
|
|||
|
Although this section is largely about <code>String</code>, both types are used heavily in
|
|||
|
Rust’s standard library, and both <code>String</code> and string slices are UTF-8 encoded.</p>
|
|||
|
<p>Rust’s standard library also includes a number of other string types, such as
|
|||
|
<code>OsString</code>, <code>OsStr</code>, <code>CString</code>, and <code>CStr</code>. Library crates can provide even
|
|||
|
more options for storing string data. See how those names all end in <code>String</code>
|
|||
|
or <code>Str</code>? They refer to owned and borrowed variants, just like the <code>String</code> and
|
|||
|
<code>str</code> types you’ve seen previously. These string types can store text in
|
|||
|
different encodings or be represented in memory in a different way, for
|
|||
|
example. We won’t discuss these other string types in this chapter; see their
|
|||
|
API documentation for more about how to use them and when each is appropriate.</p>
|
|||
|
<h3><a class="header" href="#creating-a-new-string" id="creating-a-new-string">Creating a New String</a></h3>
|
|||
|
<p>Many of the same operations available with <code>Vec&lt;T&gt;</code> are available with <code>String</code>
|
|||
|
as well, starting with the <code>new</code> function to create a string, shown in Listing
|
|||
|
8-11.</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let mut s = String::new();
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p><span class="caption">Listing 8-11: Creating a new, empty <code>String</code></span></p>
|
|||
|
<p>This line creates a new empty string called <code>s</code>, which we can then load data
|
|||
|
into. Often, we’ll have some initial data that we want to start the string
|
|||
|
with. For that, we use the <code>to_string</code> method, which is available on any type
|
|||
|
that implements the <code>Display</code> trait, as string literals do. Listing 8-12 shows
|
|||
|
two examples.</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let data = "initial contents";
|
|||
|
|
|||
|
let s = data.to_string();
|
|||
|
|
|||
|
// the method also works on a literal directly:
|
|||
|
let s = "initial contents".to_string();
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p><span class="caption">Listing 8-12: Using the <code>to_string</code> method to create a
|
|||
|
<code>String</code> from a string literal</span></p>
|
|||
|
<p>This code creates a string containing <code>initial contents</code>.</p>
|
|||
|
<p>We can also use the function <code>String::from</code> to create a <code>String</code> from a string
|
|||
|
literal. The code in Listing 8-13 is equivalent to the code from Listing 8-12
|
|||
|
that uses <code>to_string</code>.</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let s = String::from("initial contents");
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p><span class="caption">Listing 8-13: Using the <code>String::from</code> function to create
|
|||
|
a <code>String</code> from a string literal</span></p>
|
|||
|
<p>Because strings are used for so many things, we can use many different generic
|
|||
|
APIs for strings, providing us with a lot of options. Some of them can seem
|
|||
|
redundant, but they all have their place! In this case, <code>String::from</code> and
|
|||
|
<code>to_string</code> do the same thing, so which you choose is a matter of style.</p>
|
|||
|
<p>Remember that strings are UTF-8 encoded, so we can include any properly encoded
|
|||
|
data in them, as shown in Listing 8-14.</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let hello = String::from("السلام عليكم");
|
|||
|
let hello = String::from("Dobrý den");
|
|||
|
let hello = String::from("Hello");
|
|||
|
let hello = String::from("שָׁלוֹם");
|
|||
|
let hello = String::from("नमस्ते");
|
|||
|
let hello = String::from("こんにちは");
|
|||
|
let hello = String::from("안녕하세요");
|
|||
|
let hello = String::from("你好");
|
|||
|
let hello = String::from("Olá");
|
|||
|
let hello = String::from("Здравствуйте");
|
|||
|
let hello = String::from("Hola");
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p><span class="caption">Listing 8-14: Storing greetings in different languages in
|
|||
|
strings</span></p>
|
|||
|
<p>All of these are valid <code>String</code> values.</p>
|
|||
|
<h3><a class="header" href="#updating-a-string" id="updating-a-string">Updating a String</a></h3>
|
|||
|
<p>A <code>String</code> can grow in size and its contents can change, just like the contents
|
|||
|
of a <code>Vec&lt;T&gt;</code>, if you push more data into it. In addition, you can conveniently
|
|||
|
use the <code>+</code> operator or the <code>format!</code> macro to concatenate <code>String</code> values.</p>
|
|||
|
<h4><a class="header" href="#appending-to-a-string-with-push_str-and-push" id="appending-to-a-string-with-push_str-and-push">Appending to a String with <code>push_str</code> and <code>push</code></a></h4>
|
|||
|
<p>We can grow a <code>String</code> by using the <code>push_str</code> method to append a string slice,
|
|||
|
as shown in Listing 8-15.</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let mut s = String::from("foo");
|
|||
|
s.push_str("bar");
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p><span class="caption">Listing 8-15: Appending a string slice to a <code>String</code>
|
|||
|
using the <code>push_str</code> method</span></p>
|
|||
|
<p>After these two lines, <code>s</code> will contain <code>foobar</code>. The <code>push_str</code> method takes a
|
|||
|
string slice because we don’t necessarily want to take ownership of the
|
|||
|
parameter. For example, the code in Listing 8-16 shows that it would be
|
|||
|
unfortunate if we weren’t able to use <code>s2</code> after appending its contents to <code>s1</code>.</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let mut s1 = String::from("foo");
|
|||
|
let s2 = "bar";
|
|||
|
s1.push_str(s2);
|
|||
|
println!("s2 is {}", s2);
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p><span class="caption">Listing 8-16: Using a string slice after appending its
|
|||
|
contents to a <code>String</code></span></p>
|
|||
|
<p>If the <code>push_str</code> method took ownership of <code>s2</code>, we wouldn’t be able to print
|
|||
|
its value on the last line. However, this code works as we’d expect!</p>
|
|||
|
<p>The <code>push</code> method takes a single character as a parameter and adds it to the
|
|||
|
<code>String</code>. Listing 8-17 shows code that adds the letter <em>l</em> to a <code>String</code> using
|
|||
|
the <code>push</code> method.</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let mut s = String::from("lo");
|
|||
|
s.push('l');
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p><span class="caption">Listing 8-17: Adding one character to a <code>String</code> value
|
|||
|
using <code>push</code></span></p>
|
|||
|
<p>As a result of this code, <code>s</code> will contain <code>lol</code>.</p>
|
|||
|
<h4><a class="header" href="#concatenation-with-the--operator-or-the-format-macro" id="concatenation-with-the--operator-or-the-format-macro">Concatenation with the <code>+</code> Operator or the <code>format!</code> Macro</a></h4>
|
|||
|
<p>Often, you’ll want to combine two existing strings. One way is to use the <code>+</code>
|
|||
|
operator, as shown in Listing 8-18.</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let s1 = String::from("Hello, ");
|
|||
|
let s2 = String::from("world!");
|
|||
|
let s3 = s1 + &amp;s2; // note s1 has been moved here and can no longer be used
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p><span class="caption">Listing 8-18: Using the <code>+</code> operator to combine two
|
|||
|
<code>String</code> values into a new <code>String</code> value</span></p>
|
|||
|
<p>The string <code>s3</code> will contain <code>Hello, world!</code> as a result of this code. The
|
|||
|
reason <code>s1</code> is no longer valid after the addition and the reason we used a
|
|||
|
reference to <code>s2</code> has to do with the signature of the method that gets called
|
|||
|
when we use the <code>+</code> operator. The <code>+</code> operator uses the <code>add</code> method, whose
|
|||
|
signature looks something like this:</p>
|
|||
|
<pre><code class="language-rust ignore">fn add(self, s: &str) -> String {
|
|||
|
</code></pre>
|
|||
|
<p>This isn’t the exact signature that’s in the standard library: in the standard
|
|||
|
library, <code>add</code> is defined using generics. Here, we’re looking at the signature
|
|||
|
of <code>add</code> with concrete types substituted for the generic ones, which is what
|
|||
|
happens when we call this method with <code>String</code> values. We’ll discuss generics
|
|||
|
in Chapter 10. This signature gives us the clues we need to understand the
|
|||
|
tricky bits of the <code>+</code> operator.</p>
|
|||
|
<p>First, <code>s2</code> has an <code>&</code>, meaning that we’re adding a <em>reference</em> of the second
|
|||
|
string to the first string because of the <code>s</code> parameter in the <code>add</code> function:
|
|||
|
we can only add a <code>&str</code> to a <code>String</code>; we can’t add two <code>String</code> values
|
|||
|
together. But wait—the type of <code>&s2</code> is <code>&String</code>, not <code>&str</code>, as specified in
|
|||
|
the second parameter to <code>add</code>. So why does Listing 8-18 compile?</p>
|
|||
|
<p>The reason we’re able to use <code>&s2</code> in the call to <code>add</code> is that the compiler
|
|||
|
can <em>coerce</em> the <code>&String</code> argument into a <code>&str</code>. When we call the <code>add</code>
|
|||
|
method, Rust uses a <em>deref coercion</em>, which here turns <code>&s2</code> into <code>&s2[..]</code>.
|
|||
|
We’ll discuss deref coercion in more depth in Chapter 15. Because <code>add</code> does
|
|||
|
not take ownership of the <code>s</code> parameter, <code>s2</code> will still be a valid <code>String</code>
|
|||
|
after this operation.</p>
|
|||
|
<p>Second, we can see in the signature that <code>add</code> takes ownership of <code>self</code>,
|
|||
|
because <code>self</code> does <em>not</em> have an <code>&</code>. This means <code>s1</code> in Listing 8-18 will be
|
|||
|
moved into the <code>add</code> call and no longer be valid after that. So although <code>let s3 = s1 + &amp;s2;</code> looks like it will copy both strings and create a new one, this
|
|||
|
statement actually takes ownership of <code>s1</code>, appends a copy of the contents of
|
|||
|
<code>s2</code>, and then returns ownership of the result. In other words, it looks like
|
|||
|
it’s making a lot of copies but isn’t; the implementation is more efficient
|
|||
|
than copying.</p>
|
|||
|
<p>If we need to concatenate multiple strings, the behavior of the <code>+</code> operator
|
|||
|
gets unwieldy:</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let s1 = String::from("tic");
|
|||
|
let s2 = String::from("tac");
|
|||
|
let s3 = String::from("toe");
|
|||
|
|
|||
|
let s = s1 + "-" + &amp;s2 + "-" + &amp;s3;
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p>At this point, <code>s</code> will be <code>tic-tac-toe</code>. With all of the <code>+</code> and <code>"</code>
|
|||
|
characters, it’s difficult to see what’s going on. For more complicated string
|
|||
|
combining, we can use the <code>format!</code> macro:</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let s1 = String::from("tic");
|
|||
|
let s2 = String::from("tac");
|
|||
|
let s3 = String::from("toe");
|
|||
|
|
|||
|
let s = format!("{}-{}-{}", s1, s2, s3);
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p>This code also sets <code>s</code> to <code>tic-tac-toe</code>. The <code>format!</code> macro works in the same
|
|||
|
way as <code>println!</code>, but instead of printing the output to the screen, it returns
|
|||
|
a <code>String</code> with the contents. The version of the code using <code>format!</code> is much
|
|||
|
easier to read and doesn’t take ownership of any of its parameters.</p>
|
|||
|
<h3><a class="header" href="#indexing-into-strings" id="indexing-into-strings">Indexing into Strings</a></h3>
|
|||
|
<p>In many other programming languages, accessing individual characters in a
|
|||
|
string by referencing them by index is a valid and common operation. However,
|
|||
|
if you try to access parts of a <code>String</code> using indexing syntax in Rust, you’ll
|
|||
|
get an error. Consider the invalid code in Listing 8-19.</p>
|
|||
|
<pre><code class="language-rust ignore does_not_compile">let s1 = String::from("hello");
|
|||
|
let h = s1[0];
|
|||
|
</code></pre>
|
|||
|
<p><span class="caption">Listing 8-19: Attempting to use indexing syntax with a
|
|||
|
String</span></p>
|
|||
|
<p>This code will result in the following error:</p>
|
|||
|
<pre><code class="language-text">error[E0277]: the trait bound `std::string::String: std::ops::Index&lt;{integer}&gt;` is not satisfied
|
|||
|
-->
|
|||
|
|
|
|||
|
3 | let h = s1[0];
|
|||
|
| ^^^^^ the type `std::string::String` cannot be indexed by `{integer}`
|
|||
|
|
|
|||
|
= help: the trait `std::ops::Index&lt;{integer}&gt;` is not implemented for `std::string::String`
|
|||
|
</code></pre>
|
|||
|
<p>The error and the note tell the story: Rust strings don’t support indexing. But
|
|||
|
why not? To answer that question, we need to discuss how Rust stores strings in
|
|||
|
memory.</p>
|
|||
|
<h4><a class="header" href="#internal-representation" id="internal-representation">Internal Representation</a></h4>
|
|||
|
<p>A <code>String</code> is a wrapper over a <code>Vec&lt;u8&gt;</code>. Let’s look at some of our properly
|
|||
|
encoded UTF-8 example strings from Listing 8-14. First, this one:</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let len = String::from("Hola").len();
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p>In this case, <code>len</code> will be 4, which means the vector storing the string “Hola”
|
|||
|
is 4 bytes long. Each of these letters takes 1 byte when encoded in UTF-8. But
|
|||
|
what about the following line? (Note that this string begins with the capital
|
|||
|
Cyrillic letter Ze, not the Arabic number 3.)</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let len = String::from("Здравствуйте").len();
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p>Asked how long the string is, you might say 12. However, Rust’s answer is 24:
|
|||
|
that’s the number of bytes it takes to encode “Здравствуйте” in UTF-8, because
|
|||
|
each Unicode scalar value in that string takes 2 bytes of storage. Therefore,
|
|||
|
an index into the string’s bytes will not always correlate to a valid Unicode
|
|||
|
scalar value. To demonstrate, consider this invalid Rust code:</p>
|
|||
|
<pre><code class="language-rust ignore does_not_compile">let hello = "Здравствуйте";
|
|||
|
let answer = &hello[0];
|
|||
|
</code></pre>
|
|||
|
<p>What should the value of <code>answer</code> be? Should it be <code>З</code>, the first letter? When
|
|||
|
encoded in UTF-8, the first byte of <code>З</code> is <code>208</code> and the second is <code>151</code>, so
|
|||
|
<code>answer</code> should in fact be <code>208</code>, but <code>208</code> is not a valid character on its
|
|||
|
own. Returning <code>208</code> is likely not what a user would want if they asked for the
|
|||
|
first letter of this string; however, that’s the only data that Rust has at
|
|||
|
byte index 0. Users generally don’t want the byte value returned, even if the
|
|||
|
string contains only Latin letters: if <code>&"hello"[0]</code> were valid code that
|
|||
|
returned the byte value, it would return <code>104</code>, not <code>h</code>. To avoid returning an
|
|||
|
unexpected value and causing bugs that might not be discovered immediately,
|
|||
|
Rust doesn’t compile this code at all and prevents misunderstandings early in
|
|||
|
the development process.</p>
|
|||
|
<h4><a class="header" href="#bytes-and-scalar-values-and-grapheme-clusters-oh-my" id="bytes-and-scalar-values-and-grapheme-clusters-oh-my">Bytes and Scalar Values and Grapheme Clusters! Oh My!</a></h4>
|
|||
|
<p>Another point about UTF-8 is that there are actually three relevant ways to
|
|||
|
look at strings from Rust’s perspective: as bytes, scalar values, and grapheme
|
|||
|
clusters (the closest thing to what we would call <em>letters</em>).</p>
|
|||
|
<p>If we look at the Hindi word “नमस्ते” written in the Devanagari script, it is
|
|||
|
stored as a vector of <code>u8</code> values that looks like this:</p>
|
|||
|
<pre><code class="language-text">[224, 164, 168, 224, 164, 174, 224, 164, 184, 224, 165, 141, 224, 164, 164,
|
|||
|
224, 165, 135]
|
|||
|
</code></pre>
|
|||
|
<p>That’s 18 bytes and is how computers ultimately store this data. If we look at
|
|||
|
them as Unicode scalar values, which are what Rust’s <code>char</code> type is, those
|
|||
|
bytes look like this:</p>
|
|||
|
<pre><code class="language-text">['न', 'म', 'स', '्', 'त', 'े']
|
|||
|
</code></pre>
|
|||
|
<p>There are six <code>char</code> values here, but the fourth and sixth are not letters:
|
|||
|
they’re diacritics that don’t make sense on their own. Finally, if we look at
|
|||
|
them as grapheme clusters, we’d get what a person would call the four letters
|
|||
|
that make up the Hindi word:</p>
|
|||
|
<pre><code class="language-text">["न", "म", "स्", "ते"]
|
|||
|
</code></pre>
|
|||
|
<p>Rust provides different ways of interpreting the raw string data that computers
|
|||
|
store so that each program can choose the interpretation it needs, no matter
|
|||
|
what human language the data is in.</p>
|
|||
|
<p>A final reason Rust doesn’t allow us to index into a <code>String</code> to get a
|
|||
|
character is that indexing operations are expected to always take constant time
|
|||
|
(O(1)). But it isn’t possible to guarantee that performance with a <code>String</code>,
|
|||
|
because Rust would have to walk through the contents from the beginning to the
|
|||
|
index to determine how many valid characters there were.</p>
|
|||
|
<h3><a class="header" href="#slicing-strings" id="slicing-strings">Slicing Strings</a></h3>
|
|||
|
<p>Indexing into a string is often a bad idea because it’s not clear what the
|
|||
|
return type of the string-indexing operation should be: a byte value, a
|
|||
|
character, a grapheme cluster, or a string slice. Therefore, Rust asks you to
|
|||
|
be more specific if you really need to use indices to create string slices. To
|
|||
|
be more specific in your indexing and indicate that you want a string slice,
|
|||
|
rather than indexing using <code>[]</code> with a single number, you can use <code>[]</code> with a
|
|||
|
range to create a string slice containing particular bytes:</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>let hello = "Здравствуйте";
|
|||
|
|
|||
|
let s = &amp;hello[0..4];
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p>Here, <code>s</code> will be a <code>&amp;str</code> that contains the first 4 bytes of the string.
|
|||
|
Earlier, we mentioned that each of these characters was 2 bytes, which means
|
|||
|
<code>s</code> will be <code>Зд</code>.</p>
|
|||
|
<p>What would happen if we used <code>&amp;hello[0..1]</code>? The answer: Rust would panic at
|
|||
|
runtime in the same way as if an invalid index were accessed in a vector:</p>
|
|||
|
<pre><code class="language-text">thread 'main' panicked at 'byte index 1 is not a char boundary; it is inside 'З' (bytes 0..2) of `Здравствуйте`', src/libcore/str/mod.rs:2188:4
|
|||
|
</code></pre>
|
|||
|
<p>You should use ranges to create string slices with caution, because doing so
|
|||
|
can crash your program.</p>
|
|||
|
<h3><a class="header" href="#methods-for-iterating-over-strings" id="methods-for-iterating-over-strings">Methods for Iterating Over Strings</a></h3>
|
|||
|
<p>Fortunately, you can access elements in a string in other ways.</p>
|
|||
|
<p>If you need to perform operations on individual Unicode scalar values, the best
|
|||
|
way to do so is to use the <code>chars</code> method. Calling <code>chars</code> on “नमस्ते” separates
|
|||
|
out and returns six values of type <code>char</code>, and you can iterate over the result
|
|||
|
to access each element:</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>for c in "नमस्ते".chars() {
|
|||
|
println!("{}", c);
|
|||
|
}
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p>This code will print the following:</p>
|
|||
|
<pre><code class="language-text">न
|
|||
|
म
|
|||
|
स
|
|||
|
्
|
|||
|
त
|
|||
|
े
|
|||
|
</code></pre>
|
|||
|
<p>The <code>bytes</code> method returns each raw byte, which might be appropriate for your
|
|||
|
domain:</p>
|
|||
|
<pre><pre class="playpen"><code class="language-rust">
|
|||
|
<span class="boring">#![allow(unused_variables)]
|
|||
|
</span><span class="boring">fn main() {
|
|||
|
</span>for b in "नमस्ते".bytes() {
|
|||
|
println!("{}", b);
|
|||
|
}
|
|||
|
<span class="boring">}
|
|||
|
</span></code></pre></pre>
|
|||
|
<p>This code will print the 18 bytes that make up this string:</p>
|
|||
|
<pre><code class="language-text">224
|
|||
|
164
|
|||
|
// --snip--
|
|||
|
165
|
|||
|
135
|
|||
|
</code></pre>
|
|||
|
<p>But be sure to remember that valid Unicode scalar values may be made up of more
|
|||
|
than 1 byte.</p>
|
|||
|
<p>Getting grapheme clusters from strings is complex, so this functionality is not
|
|||
|
provided by the standard library. Crates are available on
|
|||
|
<a href="https://crates.io/">crates.io</a> if this is the functionality you need.</p>
|
|||
|
<h3><a class="header" href="#strings-are-not-so-simple" id="strings-are-not-so-simple">Strings Are Not So Simple</a></h3>
|
|||
|
<p>To summarize, strings are complicated. Different programming languages make
|
|||
|
different choices about how to present this complexity to the programmer. Rust
|
|||
|
has chosen to make the correct handling of <code>String</code> data the default behavior
|
|||
|
for all Rust programs, which means programmers have to put more thought into
|
|||
|
handling UTF-8 data upfront. This trade-off exposes more of the complexity of
|
|||
|
strings than is apparent in other programming languages, but it prevents you
|
|||
|
from having to handle errors involving non-ASCII characters later in your
|
|||
|
development life cycle.</p>
|
|||
|
<p>Let’s switch to something a bit less complex: hash maps!</p>
|
|||
|
|
|||
|
</main>
|
|||
|
|
|||
|
<nav class="nav-wrapper" aria-label="Page navigation">
|
|||
|
<!-- Mobile navigation buttons -->
|
|||
|
|
|||
|
<a rel="prev" href="ch08-01-vectors.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
|
|||
|
<i class="fa fa-angle-left"></i>
|
|||
|
</a>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<a rel="next" href="ch08-03-hash-maps.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
|
|||
|
<i class="fa fa-angle-right"></i>
|
|||
|
</a>
|
|||
|
|
|||
|
|
|||
|
<div style="clear: both"></div>
|
|||
|
</nav>
|
|||
|
</div>
|
|||
|
</div>
|
|||
|
|
|||
|
<nav class="nav-wide-wrapper" aria-label="Page navigation">
|
|||
|
|
|||
|
<a href="ch08-01-vectors.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
|
|||
|
<i class="fa fa-angle-left"></i>
|
|||
|
</a>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<a href="ch08-03-hash-maps.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
|
|||
|
<i class="fa fa-angle-right"></i>
|
|||
|
</a>
|
|||
|
|
|||
|
</nav>
|
|||
|
|
|||
|
</div>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<script type="text/javascript">
|
|||
|
window.playpen_copyable = true;
|
|||
|
</script>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
<script src="elasticlunr.min.js" type="text/javascript" charset="utf-8"></script>
|
|||
|
<script src="mark.min.js" type="text/javascript" charset="utf-8"></script>
|
|||
|
<script src="searcher.js" type="text/javascript" charset="utf-8"></script>
|
|||
|
|
|||
|
|
|||
|
<script src="clipboard.min.js" type="text/javascript" charset="utf-8"></script>
|
|||
|
<script src="highlight.js" type="text/javascript" charset="utf-8"></script>
|
|||
|
<script src="book.js" type="text/javascript" charset="utf-8"></script>
|
|||
|
|
|||
|
<!-- Custom JS scripts -->
|
|||
|
|
|||
|
<script type="text/javascript" src="ferris.js"></script>
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
</body>
|
|||
|
</html>
|