From e94464004c64d4fc82c517cc8aa3398cb563135d Mon Sep 17 00:00:00 2001 From: ravenscroftj Date: Tue, 21 Mar 2023 06:30:16 +0000 Subject: [PATCH] Add 'brainsteam/content/annotations/2023/03/21/1679379947.md' --- .../annotations/2023/03/21/1679379947.md | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 brainsteam/content/annotations/2023/03/21/1679379947.md diff --git a/brainsteam/content/annotations/2023/03/21/1679379947.md b/brainsteam/content/annotations/2023/03/21/1679379947.md new file mode 100644 index 0000000..efae6ed --- /dev/null +++ b/brainsteam/content/annotations/2023/03/21/1679379947.md @@ -0,0 +1,73 @@ +--- +date: '2023-03-21T06:25:47' +hypothesis-meta: + created: '2023-03-21T06:25:47.417575+00:00' + document: + title: + - 'GPT-4 and professional benchmarks: the wrong answer to the wrong question' + flagged: false + group: __world__ + hidden: false + id: N6BVsMexEe2Z4X92AfjYDg + links: + html: https://hypothes.is/a/N6BVsMexEe2Z4X92AfjYDg + incontext: https://hyp.is/N6BVsMexEe2Z4X92AfjYDg/aisnakeoil.substack.com/p/gpt-4-and-professional-benchmarks + json: https://hypothes.is/api/annotations/N6BVsMexEe2Z4X92AfjYDg + permissions: + admin: + - acct:ravenscroftj@hypothes.is + delete: + - acct:ravenscroftj@hypothes.is + read: + - group:__world__ + update: + - acct:ravenscroftj@hypothes.is + tags: + - llm + - openai + - gpt + - ModelEvaluation + target: + - selector: + - endContainer: /div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/article[1]/div[4]/div[1]/div[1]/p[4]/span[2] + endOffset: 300 + startContainer: /div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/article[1]/div[4]/div[1]/div[1]/p[4]/span[1] + startOffset: 0 + type: RangeSelector + - end: 5998 + start: 5517 + type: TextPositionSelector + - exact: "To benchmark GPT-4\u2019s coding ability, OpenAI evaluated it on problems\ + \ from Codeforces, a website that hosts coding competitions. Surprisingly,\ + \ Horace He pointed out that GPT-4 solved 10/10 pre-2021 problems and 0/10\ + \ recent problems in the easy category. The training data cutoff for GPT-4\ + \ is September 2021. This strongly suggests that the model is able to memorize\ + \ solutions from its training set \u2014 or at least partly memorize them,\ + \ enough that it can fill in what it can\u2019t recall." + prefix: 'm 1: training data contamination' + suffix: As further evidence for this hyp + type: TextQuoteSelector + source: https://aisnakeoil.substack.com/p/gpt-4-and-professional-benchmarks + text: OpenAI was only able to pass questions available before september 2021 and + failed to answer new questions - strongly suggesting that it has simply memorised + the answers as part of its training + updated: '2023-03-21T06:26:57.441600+00:00' + uri: https://aisnakeoil.substack.com/p/gpt-4-and-professional-benchmarks + user: acct:ravenscroftj@hypothes.is + user_info: + display_name: James Ravenscroft +in-reply-to: https://aisnakeoil.substack.com/p/gpt-4-and-professional-benchmarks +tags: +- llm +- openai +- gpt +- ModelEvaluation +- hypothesis +type: annotation +url: /annotations/2023/03/21/1679379947 + +--- + + + +
To benchmark GPT-4’s coding ability, OpenAI evaluated it on problems from Codeforces, a website that hosts coding competitions. Surprisingly, Horace He pointed out that GPT-4 solved 10/10 pre-2021 problems and 0/10 recent problems in the easy category. The training data cutoff for GPT-4 is September 2021. This strongly suggests that the model is able to memorize solutions from its training set — or at least partly memorize them, enough that it can fill in what it can’t recall.
GPT-4 was only able to pass questions that were available before September 2021 and failed to answer newer ones, strongly suggesting that it has simply memorised the answers as part of its training data.
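This cutoff-split check generalises beyond Codeforces: partition a benchmark by publication date relative to the model's training cutoff and compare solve rates on each side. Here is a minimal sketch in Python, assuming a hypothetical `model_solves` judge and a list of problems with release dates; both are placeholders for illustration, not a real Codeforces or OpenAI API:

```python
from datetime import date

# GPT-4's reported training data cutoff (from the quoted post).
CUTOFF = date(2021, 9, 1)


def cutoff_split_solve_rates(problems, model_solves, cutoff=CUTOFF):
    """Compare solve rates on problems released before vs. after `cutoff`.

    `problems` is a list of (problem_id, release_date) pairs and
    `model_solves` is a hypothetical callable returning True if the model
    produces an accepted solution. A large gap (e.g. 10/10 before vs. 0/10
    after) points to memorisation of training data rather than genuine
    problem-solving ability.
    """
    before = [pid for pid, released in problems if released < cutoff]
    after = [pid for pid, released in problems if released >= cutoff]

    def rate(ids):
        # Fraction solved; NaN if the partition is empty.
        return sum(map(model_solves, ids)) / len(ids) if ids else float("nan")

    return rate(before), rate(after)
```

Splitting by date only works when the benchmark problems carry reliable publication timestamps and the cutoff is known; otherwise contamination has to be probed some other way.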