From 78b2483fad8ca5233b0d3882100d6f46b00c6717 Mon Sep 17 00:00:00 2001 From: ravenscroftj Date: Mon, 19 Dec 2022 15:00:03 +0000 Subject: [PATCH] Add 'brainsteam/content/annotations/2022/12/19/1671461885.md' --- .../annotations/2022/12/19/1671461885.md | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 brainsteam/content/annotations/2022/12/19/1671461885.md diff --git a/brainsteam/content/annotations/2022/12/19/1671461885.md b/brainsteam/content/annotations/2022/12/19/1671461885.md new file mode 100644 index 0000000..d0167ef --- /dev/null +++ b/brainsteam/content/annotations/2022/12/19/1671461885.md @@ -0,0 +1,71 @@ +--- +date: '2022-12-19T14:58:05' +hypothesis-meta: + created: '2022-12-19T14:58:05.006973+00:00' + document: + title: + - My AI Safety Lecture for UT Effective Altruism + flagged: false + group: __world__ + hidden: false + id: iqqNRH-tEe2fKTMGgQumvA + links: + html: https://hypothes.is/a/iqqNRH-tEe2fKTMGgQumvA + incontext: https://hyp.is/iqqNRH-tEe2fKTMGgQumvA/scottaaronson.blog/?p=6823 + json: https://hypothes.is/api/annotations/iqqNRH-tEe2fKTMGgQumvA + permissions: + admin: + - acct:ravenscroftj@hypothes.is + delete: + - acct:ravenscroftj@hypothes.is + read: + - group:__world__ + update: + - acct:ravenscroftj@hypothes.is + tags: + - explainability + - nlproc + target: + - selector: + - endContainer: /div[2]/div[2]/div[2]/div[1]/p[100] + endOffset: 429 + startContainer: /div[2]/div[2]/div[2]/div[1]/p[100] + startOffset: 0 + type: RangeSelector + - end: 41343 + start: 40914 + type: TextPositionSelector + - exact: "Now, this can all be defeated with enough effort. For example, if you\ + \ used another AI to paraphrase GPT\u2019s output\u2014well okay, we\u2019\ + re not going to be able to detect that. On the other hand, if you just insert\ + \ or delete a few words here and there, or rearrange the order of some sentences,\ + \ the watermarking signal will still be there. 
Because it depends only on\ + \ a sum over n-grams, it\u2019s robust against those sorts of interventions." + prefix: "which parts probably didn\u2019t.\n\n\n\n" + suffix: ' + + + + + The hope is that this can be' + type: TextQuoteSelector + source: https://scottaaronson.blog/?p=6823 + text: this mechanism can be defeated by paraphrasing the output with another model + updated: '2022-12-19T14:58:05.006973+00:00' + uri: https://scottaaronson.blog/?p=6823 + user: acct:ravenscroftj@hypothes.is + user_info: + display_name: James Ravenscroft +in-reply-to: https://scottaaronson.blog/?p=6823 +tags: +- explainability +- nlproc +- hypothesis +type: annotation +url: /annotations/2022/12/19/1671461885 + +--- + + + +
Now, this can all be defeated with enough effort. For example, if you used another AI to paraphrase GPT’s output—well okay, we’re not going to be able to detect that. On the other hand, if you just insert or delete a few words here and there, or rearrange the order of some sentences, the watermarking signal will still be there. Because it depends only on a sum over n-grams, it’s robust against those sorts of interventions.
This mechanism can be defeated by paraphrasing the output with another model. \ No newline at end of file