From e94464004c64d4fc82c517cc8aa3398cb563135d Mon Sep 17 00:00:00 2001 From: ravenscroftj Date: Tue, 21 Mar 2023 06:30:16 +0000 Subject: [PATCH] Add 'brainsteam/content/annotations/2023/03/21/1679379947.md' --- .../annotations/2023/03/21/1679379947.md | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 brainsteam/content/annotations/2023/03/21/1679379947.md diff --git a/brainsteam/content/annotations/2023/03/21/1679379947.md b/brainsteam/content/annotations/2023/03/21/1679379947.md new file mode 100644 index 0000000..efae6ed --- /dev/null +++ b/brainsteam/content/annotations/2023/03/21/1679379947.md @@ -0,0 +1,73 @@ +--- +date: '2023-03-21T06:25:47' +hypothesis-meta: + created: '2023-03-21T06:25:47.417575+00:00' + document: + title: + - 'GPT-4 and professional benchmarks: the wrong answer to the wrong question' + flagged: false + group: __world__ + hidden: false + id: N6BVsMexEe2Z4X92AfjYDg + links: + html: https://hypothes.is/a/N6BVsMexEe2Z4X92AfjYDg + incontext: https://hyp.is/N6BVsMexEe2Z4X92AfjYDg/aisnakeoil.substack.com/p/gpt-4-and-professional-benchmarks + json: https://hypothes.is/api/annotations/N6BVsMexEe2Z4X92AfjYDg + permissions: + admin: + - acct:ravenscroftj@hypothes.is + delete: + - acct:ravenscroftj@hypothes.is + read: + - group:__world__ + update: + - acct:ravenscroftj@hypothes.is + tags: + - llm + - openai + - gpt + - ModelEvaluation + target: + - selector: + - endContainer: /div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/article[1]/div[4]/div[1]/div[1]/p[4]/span[2] + endOffset: 300 + startContainer: /div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/article[1]/div[4]/div[1]/div[1]/p[4]/span[1] + startOffset: 0 + type: RangeSelector + - end: 5998 + start: 5517 + type: TextPositionSelector + - exact: "To benchmark GPT-4\u2019s coding ability, OpenAI evaluated it on problems\ + \ from Codeforces, a website that hosts coding competitions. Surprisingly,\ + \ Horace He pointed out that GPT-4 solved 10/10 pre-2021 problems and 0/10\ + \ recent problems in the easy category. The training data cutoff for GPT-4\ + \ is September 2021. This strongly suggests that the model is able to memorize\ + \ solutions from its training set \u2014 or at least partly memorize them,\ + \ enough that it can fill in what it can\u2019t recall." + prefix: 'm 1: training data contamination' + suffix: As further evidence for this hyp + type: TextQuoteSelector + source: https://aisnakeoil.substack.com/p/gpt-4-and-professional-benchmarks + text: OpenAI was only able to pass questions available before september 2021 and + failed to answer new questions - strongly suggesting that it has simply memorised + the answers as part of its training + updated: '2023-03-21T06:26:57.441600+00:00' + uri: https://aisnakeoil.substack.com/p/gpt-4-and-professional-benchmarks + user: acct:ravenscroftj@hypothes.is + user_info: + display_name: James Ravenscroft +in-reply-to: https://aisnakeoil.substack.com/p/gpt-4-and-professional-benchmarks +tags: +- llm +- openai +- gpt +- ModelEvaluation +- hypothesis +type: annotation +url: /annotations/2023/03/21/1679379947 + +--- + + + +
To benchmark GPT-4’s coding ability, OpenAI evaluated it on problems from Codeforces, a website that hosts coding competitions. Surprisingly, Horace He pointed out that GPT-4 solved 10/10 pre-2021 problems and 0/10 recent problems in the easy category. The training data cutoff for GPT-4 is September 2021. This strongly suggests that the model is able to memorize solutions from its training set — or at least partly memorize them, enough that it can fill in what it can’t recall.
GPT-4 was only able to pass questions that were available before September 2021 and failed to answer newer ones, strongly suggesting that it has simply memorised the answers as part of its training data.
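This cutoff-split check generalises beyond Codeforces: partition a benchmark by publication date relative to the model's training cutoff and compare solve rates on each side. Here is a minimal sketch in Python, assuming a hypothetical `model_solves` judge and a list of problems with release dates; both are placeholders for illustration, not a real Codeforces or OpenAI API:

```python
from datetime import date

# GPT-4's reported training data cutoff (from the quoted post).
CUTOFF = date(2021, 9, 1)


def cutoff_split_solve_rates(problems, model_solves, cutoff=CUTOFF):
    """Compare solve rates on problems released before vs. after `cutoff`.

    `problems` is a list of (problem_id, release_date) pairs and
    `model_solves` is a hypothetical callable returning True if the model
    produces an accepted solution. A large gap (e.g. 10/10 before vs. 0/10
    after) points to memorisation of training data rather than genuine
    problem-solving ability.
    """
    before = [pid for pid, released in problems if released < cutoff]
    after = [pid for pid, released in problems if released >= cutoff]

    def rate(ids):
        # Fraction solved; NaN if the partition is empty.
        return sum(map(model_solves, ids)) / len(ids) if ids else float("nan")

    return rate(before), rate(after)
```

Splitting by date only works when the benchmark problems carry reliable publication timestamps and the cutoff is known; otherwise contamination has to be probed some other way.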