From 507556d60af91da519c3a8fa0aeb43ec149fb1d9 Mon Sep 17 00:00:00 2001 From: ravenscroftj Date: Mon, 19 Dec 2022 15:00:13 +0000 Subject: [PATCH] Add 'brainsteam/content/annotations/2022/12/19/1671461409.md' --- .../annotations/2022/12/19/1671461409.md | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 brainsteam/content/annotations/2022/12/19/1671461409.md diff --git a/brainsteam/content/annotations/2022/12/19/1671461409.md b/brainsteam/content/annotations/2022/12/19/1671461409.md new file mode 100644 index 0000000..2aa3f20 --- /dev/null +++ b/brainsteam/content/annotations/2022/12/19/1671461409.md @@ -0,0 +1,68 @@ +--- +date: '2022-12-19T14:50:09' +hypothesis-meta: + created: '2022-12-19T14:50:09.008193+00:00' + document: + title: + - My AI Safety Lecture for UT Effective Altruism + flagged: false + group: __world__ + hidden: false + id: bvVepH-sEe2uPgfvTF7V-w + links: + html: https://hypothes.is/a/bvVepH-sEe2uPgfvTF7V-w + incontext: https://hyp.is/bvVepH-sEe2uPgfvTF7V-w/scottaaronson.blog/?p=6823 + json: https://hypothes.is/api/annotations/bvVepH-sEe2uPgfvTF7V-w + permissions: + admin: + - acct:ravenscroftj@hypothes.is + delete: + - acct:ravenscroftj@hypothes.is + read: + - group:__world__ + update: + - acct:ravenscroftj@hypothes.is + tags: + - explainability + - nlproc + target: + - selector: + - endContainer: /div[2]/div[2]/div[2]/div[1]/p[72] + endOffset: 437 + startContainer: /div[2]/div[2]/div[2]/div[1]/p[72] + startOffset: 10 + type: RangeSelector + - end: 29171 + start: 28744 + type: TextPositionSelector + - exact: " Eventually GPT will say, \u201Coh, I know what game we\u2019re playing!\ + \ it\u2019s the \u2018give false answers\u2019 game!\u201D And it will then\ + \ continue playing that game and give you more false answers. What the new\ + \ paper shows is that, in such cases, one can actually look at the inner layers\ + \ of the neural net and find where it has an internal representation of what\ + \ was the true answer, which then gets overridden once you get to the output\ + \ layer." + prefix: "Does 2+2=4? No.\u201D\n\n\n\n\nand so on." + suffix: "\n\n\n\nTo be clear, there\u2019s no know" + type: TextQuoteSelector + source: https://scottaaronson.blog/?p=6823 + text: this is fascinating - GPT learns the true answer to a question but will ignore + it and let the user override this in later layers of the model + updated: '2022-12-19T14:50:09.008193+00:00' + uri: https://scottaaronson.blog/?p=6823 + user: acct:ravenscroftj@hypothes.is + user_info: + display_name: James Ravenscroft +in-reply-to: https://scottaaronson.blog/?p=6823 +tags: +- explainability +- nlproc +- hypothesis +type: annotation +url: /annotations/2022/12/19/1671461409 + +--- + + + +
Eventually GPT will say, “oh, I know what game we’re playing! it’s the ‘give false answers’ game!” And it will then continue playing that game and give you more false answers. What the new paper shows is that, in such cases, one can actually look at the inner layers of the neural net and find where it has an internal representation of what was the true answer, which then gets overridden once you get to the output layer.
this is fascinating - GPT works out the true answer to a question in its inner layers, but that internal representation gets overridden by the user's "give false answers" framing by the time the output layer produces a response \ No newline at end of file
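
To make the "look at the inner layers" idea a bit more concrete, here is a rough sketch of what probing a model's hidden states can look like. This is my own illustration rather than the method from the paper Scott describes (which recovers the truth direction without any labels); the model name, layer index and toy statements below are just placeholders:

```python
# Rough illustration of probing a GPT-style model's inner layers for a
# "truth" representation. Not the paper's actual (unsupervised) method -
# just a supervised linear probe to show the general idea.
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model choice
model = AutoModel.from_pretrained("gpt2", output_hidden_states=True)
model.eval()

# Tiny toy dataset of true (1) and false (0) statements
statements = [
    ("Two plus two equals four.", 1),
    ("Two plus two equals five.", 0),
    ("Paris is the capital of France.", 1),
    ("Paris is the capital of Spain.", 0),
]

def last_token_state(text: str, layer: int = -2):
    """Hidden state of the final token at an inner (not output) layer."""
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        out = model(**inputs)
    # out.hidden_states is a tuple of [1, seq_len, dim] tensors, one per layer
    return out.hidden_states[layer][0, -1].numpy()

X = [last_token_state(text) for text, _ in statements]
y = [label for _, label in statements]

# If truthfulness is (roughly) linearly encoded at this layer, even a simple
# probe can separate true from false statements from the hidden states alone.
probe = LogisticRegression(max_iter=1000).fit(X, y)
print("probe accuracy on toy data:", probe.score(X, y))
```

The striking part of the result quoted above is that this kind of "knows the right answer" signal is readable at an inner layer even when the output layer goes on to give the wrong answer.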