# Watermarks from JVCI's synthetic bacterium
# See Figure S1 in http://www.sciencemag.org/content/suppl/2010/05/18/science.1190719.DC1/Gibson.SOM.pdf
wm1 = "GTTCGAATATTTCTATAGCTGTACATATTGTAATGCTGATAACTAATACTGTGCGCTTGACTGTGATCCTGATAAATAACTTCTTCTGTAGGGTAGAGTTTTATTTAAGGCTACTCACTGGTTGCAAACCAATGCCGTACATTACTAGCTTGATCCTTGGTCGGTCATTGGGGGATATCTCTTACTAATAGAGCGGCCTATCGCGTATTCTCGCCGGACCCCCCTCTCCCACACCAGCGGTGTAGCATCACCAAGAAAATGAGGGGAACGGATGAGGAACGAGTGGGGGCTCATTGCTGATCATAATGACTGTTTATATACTAATGCCGTCAACTGTTTGCTGTGATACTGTGCTTTCGAGGGCGGGAGATTCGTTTTTGACATACATAAATATCATGACAAAACAGCCGGTCATGACAAAACAGCCGGTCATAATAGATTAGCCGGTGACTGTGAAACTAAAGCTACTAATGCCGTCAATAAATATGATAATAGCAACGGCACTGACTGTGAAACTAAAGCCGGCACTCATAATAGATTAGCCGGAGTCGTATTCATAGCCGGTAGATATCACTATAAGGCCCAGGATCATGATGAACACAGCACCACGTCGTCGTCCGAGTTTTTTTGCTGCGACGTCTATACCACGGAAGCTGATCATAAATAGTTTTTTTGCTGCGGCACTAGAGCCGGACAAGCACACTACGTTTGTAAATACATCGTTCCGAATTGTAAATAATTTAATTTCGTATTTAAATTATATGATCACTGGCTATAGTCTAGTGATAACTACAATAGCTAGCAATAAGTCATATATAACAATAGCTGAACCTGTGCTACATATCCGCTATACGGTAGATATCACTATAAGGCCCAGGACAATAGCTGAACTGACGTCAGCAACTACGTTTAGCTTGACTGTGGTCGGTTTTTTTGCTGCGACGTCTATACGGAAGCTCATAACTATAAGAGCGGCACTAGAGCCGGCACACAAGCCGGCACAGTCGTATTCATAGCCGGCACTCATGACAAAACAGC"
wm2 = "CAACTGGCAGCATAAAACATATAGAACTACCTGCTATAAGTGATACAACTGTTTTCATAGTAAAACATACAACGTTGCTGATAGTACTCCTAAGTGATAGCTTAGTGCGTTTAGCATATATTGTAGGCTTCATAATAAGTGATATTTTAGCTACGTAACTAAATAAACTAGCTATGACTGTACTCCTAAGTGATATTTTCATCCTTTGCAATACAATAACTACTACATCAATAGTGCGTGATATGCCTGTGCTAGATATAGAACACATAACTACGTTTGCTGTTTTCAGTGATATGCTAGTTTCATCTATAGATATAGGCTGCTTAGATTCCCTACTAGCTATTTCTGTAGGTGATATACGTCCATTGCATAAGTTAATGCATTTAACTAGCTGTGATACTATAGCATCCCCATTCCTAGTGCATATTTTCATCCTAGTGCTACGTGATATAATTGTACTAATGCCTGTAGATAATTTAATGCCTGGCTCGTTTGTAGGTGATAATTTAGTGCCTGTAAAACATATACCTGAGTGCTCGTTGCGTGATAGTTCGTTCATGCATATACAACTAGGCTGCTGTGATATGGTCACTGCCCTTACTGTGCTACATATTACTGCGAGGGGGATGACGTATAAACCTGTTGTAAGTGATATGACGTATATAACTACTAGTGATATGACGTATAGGCTAGAACAACGTGATATGACGTATATGACTACTGTCCCAAACATCAGTGATATGACGTATACTATAATTTCTATAATAGTGATAAATAAACCTGGGCTAAATACGTTCCTGAATACGTGGCATAAACCTGGGCTAACGAGGAATACCCATAGTTTAGCAATAAGCTATAGTTCGTCATTTTTAA"
wm3 = "TTTAACCATATTTAAATATCATCCTGATTTTCACTGGCTCGTTGCGTGATATAGATTCTACTGTAGTGCTAGATAGTTCTGTACTAGGTGATACTATAGATTTCATAGATAGCACTACTGGCTTCATGCTAGGCATCCCAATAGCTAGTGATAGTTTAGTGCATACAACGTCATGTGATACAACGTTGCTGGCTGTAGATACAACGTCGTATTCTGTAAGTGATACAATAGCTATTGCTGTGCATAGGCCTATAGTGGCTGTAACTAGTGATATCACGTAACAACCATATAAGTTAGATTTAATGCCCCTGACTGAACGCTCGTTGCGTGATAGTTTAGGCTCGTTGCATACAACTGTGATTTTCATAAAACAACGTGATAATTTAGTGCTAGATAAGTTCCGCTTAGCAAGTGATAGTTTCCGCTTGACTGTGCATAGTTCGTTCATGCGCTCGTTGCGTGATAAACTAGGCAGCTTCACAACTGATAATTTAATTGCTGATATTGCTGGCTGTCTAGTGCTAGTGATCATAGTGCGTGATAGTTTAAGCTGCTCTGTTTTAGATATCACGTGCTTGATAATGAAACTAACTAGTGATACTACGTAGTTAACTATGAATAGGCCTACTGTAAATTCAATAGTGCGTGATATTGAACTAGATTCTGCAACTGCTAATATGCCGTGCTGCACGTTTGGTGATAGTTTAGCATGCTTCACTATAATAAATATGGTAGTTGTAACTACTGCGAATAGGGGGAGCTTAATAAATATGATCACTGTGCTACGCTATATGCCGTTGAATATAGGCTATATGATCATAACATATATAGCTATAAGTGATAAGTTCCTGAATATAGGCTATATGATCATAACATATACAACTGTACTCATGAATAAGTTAACGAGGA"
wm4 = "TTTCATTGCTGATCACTGTAGATATAGTGCATTCTATAAGTCGCTCCCACAGGCTAGTGCTGCGCACGTTTTTCAGTGATATTATCCTAGTGCTACATAACATCATAGTGCGTGATAAACCTGATACAATAGGTGATATCATAGCAACTGAACTGACGTTGCATAGCTCAACTGTGATCAGTGATATAGATTCTGATACTATAGCAACGTTGCGTGATATTTTCACTACTGGCTTGACTGTAGTGCATATGATAGTACGTCTAACTAGCATAACTAGTGATAGTTATATTTCTATAGCTGTACATATTGTAATGCTGATAACTAGTGATATAATCCAACTAGATAGTCCTGAACTGATCCCTATGCTAACTAGTGATAAACTAACTGATACATCGTTCCTGCTACGTGATAGCTTCACTGAGTTCCATACATCGTCGTGCTTAAACATCAGTGATAACACTATAGAGTTCATAGATACTGCATTAACTAGTGATATGACTGCAAATAGCTTGACGTTTTGCAGTCTAAAACAACGTGATAATTCTGTAGTGCTAGATACTATAGATTTCCTGCTAAGTGATAAGTCTACTGATTTACTAATGAATAGCTTGGTTTTGGCATACACTGTGCGCTGCACTGGTGATAGCTTTTCGTTGATGAATAATTTCCCTAGCACTGTGCGTGATATGCTAGATTCTGTAGATAGGCTAAATTCGTCTACGTTTGTAGGTGATAGTTTAGTTGCTGTAACTAATATTATCCCTGTGCCGTTGCTAAGCTGTGATATCATAGTGCTGCTAGATATGATAAGCAAACTAATAGAGTCGAGGGGGAGTCTCATAGTGAATACTGATATTTTAGTGCTGCCGTTGAATAAGTTCCCTGAACATTGTGATACTGATATTTTAGTGCTGCCGTTGAATATCCTGCATTTAACTAGCTTGATAGTGCATTCGAGGAATACCCATACTACTGTTTTCATAGCTAATTATAGGCTAACATTGCCAATAGTGC"
wm_list = [ wm1, wm2, wm3, wm4 ]
hint_list = [
"TO LIVE, TO ERR, TO FALL, TO TRIUMPH, TO RECREATE LIFE OUT OF LIFE.",
"SEE THINGS NOT AS THEY ARE, BUT AS THEY MIGHT BE.",
"WHAT I CANNOT BUILD, I CANNOT UNDERSTAND." ,
# If we add the hints below, the results look better.
# These hints are guesses made after looking at the initial
# results generated using only the hints above.
# "ABCDEFGHIJKLMNOPQRSTUVWXYZ\n 0123456789",
# "<A HREF=\"MAILTO:MROQSTIZ@JCVI.ORG\">HERE!</A>",
]
# These are guesses for individual triplet-to-letter maps
# that are difficult to incorporate into hints (without
# making the hints so large as to make the exercise pointless).
known = {
# "CCC":"-",
# "GAA":"'"
}
# This a much simpler test case of "watermarks" and "hints"
# where we know the right answer
if 0:
wm_list = [ "AAABBBAAAAAABBB" ]
hint_list = [ "XYXX" ]
# Assumptions:
# Letters are encoded by DNA triplets
# Only uppercase letters are used (from the hints, may not be correct)
# Simple substitution cipher
# Approach:
# Each hint defines a certain pattern of appearance for letters.
# If we find the same pattern for triplets in the watermarks, we
# can create a triplet-to-letter mapping. There may be multiple
# mappings that satisfy the hints, so we want to construct all
# mappings from all hints and then combine them if they are
# consistent with each other. Finally, we decode the watermarks
# using the combined mapping.
# Problems:
# (a) There may be more than one mapping that satisfy the hint
# patterns. In this case, we will have multiple "plaintext" and
# we have to figure out which one is correct by comparing them.
# (b) There will be missing mappings, ie triplets for which we
# do not have a corresponding letter. This is because we only have
# a subset of letters in the hints. For example, J and K do not
# appear in any hints, so there is no direct way to identify the
# triplet patterns that correspond to J and K. However, we can
# examine the first-round results and guess where certain letters
# should be; for example, if we see ABCDEFGHI__LMNOPQ, we can be
# fairly certain that the missing letters are J and K. We can then
# expand the set of hints by including these guessed patterns; in
# the above example, we would add a hint of ABCDEFGHIJKLMNOPQRSTUVWXYZ.
# Note that any unmapped triplet that only appear once in the
# watermarks cannot be mapped algorithmically; we can only guess
# at what it is likely to be based on patterns that we recognize
# using contexts other than the hints.
#
def main():
# Get possible mappings
letter2triplet_mappings = get_mapping(hint_list, wm_list)
# Decode watermarks using each possible mapping
for letter2triplet in letter2triplet_mappings:
triplet2letter = invert_mapping(letter2triplet)
triplet2letter.update(known.items())
for wm in wm_list:
for r in apply_mapping(triplet2letter, wm):
if r:
print(r)
print()
def get_mapping(hint_list, wm_list):
# Loop through hints to get the possible mappings,
# combining possibilities from all hints.
mappings = None
for hint in hint_list:
m = process_hint(hint, wm_list)
if mappings is None:
mappings = m
else:
mappings = reconcile(mappings, m)
# Check whether results are reasonable
if len(mappings) == 0:
print("ERROR: cannot find any suitable mapping")
elif len(mappings) > 1:
print("WARNING: %d mappings found" % len(m))
return mappings
def process_hint(hint, wm_list):
# Use the pattern defined by "hint" and apply it to each
# watermark in "wm_list" to see if we can generate
# triplet-to-letter mappings.
mappings = list()
for wm in wm_list:
mappings.extend(find_mappings(build_pattern(hint), hint, wm))
# If the hint appears twice in a watermark, or it appears
# in multiple watermarks, we will generate a mapping for
# each time it appears. If multiple mappings are identical,
# we only need one copy.
mappings = unique_mappings(mappings)
return mappings
def unique_mappings(orig_mappings):
# Remove any duplicate mappings in the list
mappings = list()
for om in orig_mappings:
for m in mappings:
if om == m:
break
else:
mappings.append(om)
return mappings
def build_pattern(hint):
# A pattern is a list of indices of the same length as the hint.
# The value of the pattern at index i is the index
# in "hint" where hint[i] last occurs. For example, "AAB"
# has a pattern of [ 1, 1, 2 ]. When we want to see if a
# string matches our pattern, we check if string[i] ==
# string[pattern[i]]. (There is a hole in this logic where
# do not enforce that two different letters must _not_ have
# the same mapping, but the mapping reconciliation code should
# catch that and eliminate the bad mapping.)
letters = dict( [ (letter, i)
for i, letter in enumerate(hint) ] )
pattern = [ letters[letter] for letter in hint ]
return pattern
def find_mappings(pattern, hint, wm):
# Dividing the watermark into triplets, find triplets that
# exhibit the sequence defined by "pattern"
mappings = list()
unique_indices = set(pattern)
triplet_length = len(pattern) * 3
for i in range(0, len(wm) - triplet_length + 1):
if pattern_match(pattern, wm, i):
mappings.append(mapping(unique_indices, hint, wm, i))
return mappings
def triplet(wm, start, i):
# Return the triplet in "watermark" at position "i" when
# the pattern begins at "start".
wm_i = start + i * 3
return wm[wm_i:wm_i+3]
def pattern_match(pattern, wm, start):
# Get len(pattern) triplets from wm starting at offset "start"
# and see if they match pattern. (See comments in "build_pattern"
# above for how pattern matching occurs.)
for i, first_i in enumerate(pattern):
if first_i == i:
# Letter always equal to itself
continue
if triplet(wm, start, i) != triplet(wm, start, first_i):
return False
return True
def mapping(unique_indices, hint, wm, start):
# Assuming that the pattern for "hint" matches watermark
# "wm" at offset "start", create the letter-to-triplet
# mapping. Only indices in "unique_indices" need to be
# processed because all other indices are redundant.
return dict( [ (hint[i], triplet(wm, start, i))
for i in unique_indices ] )
def reconcile(mappings1, mappings2):
# Each group of mappings consist of letter-to-triplet mappings
# Take all pairs of mapping, one from each group, and check
# whether they are compatible. If so, create and save a
# combined mapping.
mappings = list()
for m1 in mappings1:
for m2 in mappings2:
m = combine(m1, m2)
if m:
mappings.append(m)
return unique_mappings(mappings)
def combine(m1, m2):
# Combine two mappings. If any letter appears in both mappings
# and map to different triplets, then we return None indicating
# that the mappings are incompatible. Otherwise, we create
# a single mapping that includes the all the correspondences
# from both mappings.
m = dict()
for k in list(m1.keys()) + list(m2.keys()):
try:
v1 = m1[k]
except KeyError:
m[k] = m2[k]
continue
try:
v2 = m2[k]
except KeyError:
m[k] = m1[k]
continue
if v1 != v2:
return None
m[k] = v1
return m
def invert_mapping(m):
# Invert the mapping, making the values into keys and vice
# versa. This assumes that the mapping is one-to-one.
return dict( [ (v, k) for k, v in m.items() ] )
def apply_mapping(m, wm):
# Assume there is no frame shift and simply apply the
# triplet-to-letter mapping "m" to watermark "wm"
return [ substitute(m, wm) ]
# UNUSED: We can handle frame shifts by returning multiple
# "plaintext" for each watermark.
# Return the three possible substitution results
#return [ substitute(m, wm, offset) for offset in range(3) ]
def substitute(m, wm, offset=0):
# Apply the correspondences in mapping "m" to watermark
# "wm" starting at offset "offset".
letters = list()
bad = 0
for i in range(offset, len(wm) - 2, 3):
triplet = wm[i:i+3]
try:
letters.append(m[triplet])
except KeyError:
letters.append('[%s]' % triplet)
bad = bad + 1
# If the number of untranslated triplets exceeds a certain
# threshold, we assume the mapping is incorrect and do not
# bother to return any plaintext
if bad > len(wm) / 3 * 0.4:
return None
else:
return ''.join(letters)
if __name__ == "__main__":
# Run the program if we were invoked as a script
main()