All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
Example:
Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"
Output: ["AAAAACCCCC", "CCCCCAAAAA"]
将DNA序列看作是只包含[A, C, G,T]的字符串,给一个DNA字符串,找到所有长度为10且出现超过1次的子串。
解法一:字符串映射
枚举DNA字符串中所有长度为10的子串,将其插入到Map中,并记录子串的数量;遍历Map,将所有出现超过一次的子串存储到输出结果中。由于下面的代码使用的是有序的std::map(红黑树),每次插入为O(log n),总体算法复杂度为O(n log n);若改用哈希表std::unordered_map,平均复杂度为O(n),但输出顺序不再有序。
class Solution {
public:
    /// Finds every 10-letter substring that occurs more than once in `s`.
    ///
    /// @param s  DNA string to scan.
    /// @return   Each repeated 10-letter substring exactly once, in ascending
    ///           lexicographic order (the iteration order of std::map).
    ///           Empty when `s` has fewer than 10 characters.
    std::vector<std::string> findRepeatedDnaSequences(std::string s)
    {
        const std::size_t window = 10;

        std::vector<std::string> result;
        if (s.size() < window)
            return result;

        // Count how many times each 10-letter window appears.
        std::map<std::string, int> occurrences;
        // Safe to subtract: the guard above ensures s.size() >= window,
        // so the unsigned difference cannot wrap around.
        const std::size_t lastStart = s.size() - window;
        for (std::size_t start = 0; start <= lastStart; ++start)
        {
            ++occurrences[s.substr(start, window)];
        }

        // Bind by const reference so the key string is not copied on
        // every iteration.
        for (const auto& entry : occurrences)
        {
            if (entry.second > 1)
                result.push_back(entry.first);
        }
        return result;
    }
};
解法二:编码后映射
将长度为10的DNA序列进行整数编码,[A, C, G,T]4个字符分别用[00,01,10,11]表示,长度为10的DNA序列可以用20个比特位整数表示。
(1).设置全局整数哈希表,g_map[1048576] = 2^20,表示所有长度为10的DNA序列。
(2).将DNA字符串的前10个字符,使用左移运算转换为整数key,g_map[key]++。
(3).从DNA的第11个字符开始,按照顺序遍历各个字符,每遇到1个字符,就将key右移2位(去掉最低的2个比特位,即窗口中最旧的字符),再将新的DNA字符s[i]转换为整数后左移18位,按位或到key的最高2个比特位上(增加最高位),然后g_map[key]++。
(4).遍历哈希表g_map,若g_map[i]>1,则将整数i从低位到高位每2个比特解码为1个DNA字符,得到长度为10的DNA序列,放入结果数组。
class Solution {
private:
    // Window length in nucleotides; each nucleotide is packed into 2 bits.
    static constexpr int kSeqLen = 10;
    // Number of distinct 10-letter sequences: 4^10 == 2^20 == 1048576.
    static constexpr int kTableSize = 1 << (2 * kSeqLen);
    // Bit offset of the newest nucleotide inside the packed 20-bit key.
    static constexpr int kTopShift = 2 * (kSeqLen - 1);

    // Maps a nucleotide to its 2-bit code: A=0, C=1, G=2, T=3.
    // Any other character is encoded as 0 (i.e. treated like 'A'), which
    // matches the earlier std::map-based lookup that default-inserted 0 for
    // unknown keys.
    static int baseCode(char c)
    {
        switch (c)
        {
        case 'C': return 1;
        case 'G': return 2;
        case 'T': return 3;
        default:  return 0;
        }
    }

    // Decodes a packed key back into its 10-letter sequence. The first
    // letter of the sequence is stored in the two lowest bits, so decoding
    // reads the key from the low bits upward.
    static std::string changeToStr(int num)
    {
        static const char DNA_CHAR[] = {'A', 'C', 'G', 'T'};
        std::string str;
        str.reserve(kSeqLen);
        for (int i = 0; i < kSeqLen; i++)
        {
            str += DNA_CHAR[num & 3];
            num >>= 2;
        }
        return str;
    }

public:
    /// Finds every 10-letter substring that occurs more than once in `s`,
    /// using a 2-bit-per-letter rolling key instead of string keys.
    ///
    /// The occurrence table is local to each call, so repeated calls on the
    /// same object are independent of each other and the object itself
    /// carries no large array.
    ///
    /// @param s  DNA string to scan.
    /// @return   Each repeated 10-letter substring exactly once, ordered by
    ///           ascending packed key. Empty when `s` has fewer than 10
    ///           characters.
    std::vector<std::string> findRepeatedDnaSequences(std::string s)
    {
        std::vector<std::string> result;
        const std::size_t window = kSeqLen;
        if (s.size() < window)
            return result;

        // One saturating counter per possible key (0, 1, or "2 or more").
        // Heap-allocated and zero-initialised on every call; saturation
        // also rules out counter overflow on very long inputs.
        std::vector<unsigned char> counts(kTableSize);

        // Pack the first window: s[0] ends up in the lowest two bits and
        // s[9] in the highest two.
        int key = 0;
        for (int i = kSeqLen - 1; i >= 0; i--)
        {
            key = (key << 2) | baseCode(s[i]);
        }
        counts[key] = 1;

        // Slide the window: drop the oldest letter (lowest two bits) and
        // insert the new letter at the top.
        for (std::size_t i = window; i < s.size(); i++)
        {
            key = (key >> 2) | (baseCode(s[i]) << kTopShift);
            if (counts[key] < 2)
                ++counts[key];
        }

        for (int code = 0; code < kTableSize; code++)
        {
            if (counts[code] > 1)
            {
                result.push_back(changeToStr(code));
            }
        }
        return result;
    }
};